def _mount_volume(self, node, volume, mount_path):
    """Mount an attached EBS volume on node at mount_path.

    Logs an error and returns early if the volume has no attachment
    device recorded.
    """
    # identity test for None rather than '==' (PEP 8)
    if volume.attach_data.device is None:
        log.error("Volume %s has not been attached" % volume.id)
        return
    # newer kernels expose EBS devices as /dev/xvd* instead of /dev/sd*
    device = volume.attach_data.device.replace("sd", "xvd")
    self._mount_device(node, device, mount_path)
def main(self):
    """Top-level CLI entry point: build parsers, dispatch a subcommand.

    Handles shell tab-completion via optcomplete when available and maps
    boto service errors to a logged message and exit status 1.
    """
    # Create global options parser.
    self.gparser = gparser = self.create_global_parser()
    # Declare subcommands.
    subcmds = commands.all_cmds
    # subcommand completions
    scmap = {}
    for sc in subcmds:
        for n in sc.names:
            scmap[n] = sc
    if optcomplete:
        listcter = optcomplete.ListCompleter(scmap.keys())
        subcter = optcomplete.NoneCompleter()
        optcomplete.autocomplete(
            gparser, listcter, None, subcter, subcommands=scmap)
    elif 'COMP_LINE' in os.environ:
        # completion was requested but optcomplete is unavailable
        return -1
    gopts, sc, opts, args = self.parse_subcommands(gparser, subcmds)
    if args and args[0] == 'help':
        # make 'help' after a subcommand behave like --help
        sc.parser.print_help()
        sys.exit(0)
    try:
        sc.execute(args)
    except (EC2ResponseError, S3ResponseError, BotoServerError), e:
        # AWS-side failures: report code/message and exit non-zero
        log.error("%s: %s" % (e.error_code, e.error_message))
        sys.exit(1)
def parse_subcommands(self, gparser=None):
    """
    Parse global arguments, find subcommand from list of subcommand
    objects, parse local subcommand arguments and return a tuple of
    global options, selected command object, command options, and
    command arguments.

    Call execute() on the command object to run. The command object has
    members 'gopts' and 'opts' set for global and command options
    respectively, you don't need to call execute with those but you
    could if you wanted to.
    """
    gparser = gparser or self.gparser
    # parse global options.
    gopts, args = gparser.parse_args()
    if not args:
        gparser.print_help()
        raise SystemExit("\nError: you must specify an action.")
    # set debug level if specified
    if gopts.DEBUG:
        console.setLevel(logger.DEBUG)
        config.DEBUG_CONFIG = True
    # load StarClusterConfig into global options
    try:
        cfg = config.StarClusterConfig(gopts.CONFIG)
        cfg.load()
    except exception.ConfigNotFound, e:
        # report the missing config plus suggested alternatives, then exit
        log.error(e.msg)
        e.display_options()
        sys.exit(1)
def __local_mpe_instalation(self):
    """Run the W2IO MPE installation script on the local machine.

    Script output is redirected to log/w2io-instalation.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: local MPE instalation')
        call(self.__remote_module_path + '/bin/w2io-instalation.sh 2>&1 > ' +
             self.__remote_module_path + '/log/w2io-instalation.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and include the cause in the
        # message (matches the convention used by the PabPlug handlers)
        log.error('W2IO: local MPE instalation error: ' + str(e))
        sys.exit('W2IO: local MPE instalation error: ' + str(e))
def __local_dependency_instalation(self):
    """Run the W2IO dependency installation script on the local machine.

    Script output is redirected to log/w2io-dependency.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: local Dependency instalation')
        call(self.__remote_module_path + '/bin/w2io-dependency.sh 2>&1 > ' +
             self.__remote_module_path + '/log/w2io-dependency.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: local Dependency instalation error: ' + str(e))
        sys.exit('W2IO: local Dependency instalation error: ' + str(e))
def __local_fix_fetch(self):
    """Run the fetch-problem fixup script on the local machine.

    Script output is redirected to log/fetch-problem.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: local Fixing fetch problem')
        call(self.__remote_module_path + '/bin/fetch-problem.sh 2>&1 > ' +
             self.__remote_module_path + '/log/fetch-problem.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: local Fixing fetch problem error: ' + str(e))
        sys.exit('W2IO: local Fixing fetch problem error: ' + str(e))
def __mpe_instalation(self, node):
    """Run the W2IO MPE installation script on node via ssh.

    Script output is redirected to log/w2io-instalation.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: ' + node.alias + ' MPE instalation')
        node.ssh.execute(
            self.__remote_module_path + '/bin/w2io-instalation.sh 2>&1 > ' +
            self.__remote_module_path + '/log/w2io-instalation.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: ' + node.alias + ' MPE instalation error: ' + str(e))
        sys.exit('W2IO: ' + node.alias + ' MPE instalation error: ' + str(e))
def recover(self, nodes, master, user, user_shell, volumes):
    """Restart sge_qmaster on the master node if it is not running.

    Uses a ps/grep pipeline to count sge_qmaster processes; when the
    count is zero, starts the daemon from its install directory.
    NOTE(review): the /opt/sge6/bin/linux-x64 path is hard-coded --
    confirm it matches the cluster's SGE install location.
    """
    cmd = "ps -ef | grep sge_qmaster | grep -v grep | wc -l"
    rez = int(master.ssh.execute(cmd)[0])
    if rez == 0:
        log.error("sge_qmaster is down")
        # sge_qmaster must be started from its own directory
        cmd = "cd /opt/sge6/bin/linux-x64/ && ./sge_qmaster"
        master.ssh.execute(cmd)
def __init__(self, enable_notebook=False, notebook_passwd=None,
             notebook_directory=None, packer=None, n_engines_per_node=None,
             n_engines_master=None,
             hub_db_class='IPython.parallel.controller.dictdb.NoDB',
             log_level='INFO'):
    """Configure the ReliableIPCluster plugin.

    enable_notebook may arrive from the config file as a string; only
    the (case-insensitive) string 'true' enables it. A random 16-char
    notebook password is generated when none is supplied. Engine counts
    are coerced to int when given; unsupported packers are rejected
    with a logged error and treated as None.
    """
    super(ReliableIPCluster, self).__init__()
    # config values may be strings; 'true' (any case, stripped) enables
    if isinstance(enable_notebook, basestring):
        self.enable_notebook = enable_notebook.lower().strip() == 'true'
    else:
        self.enable_notebook = enable_notebook
    self.notebook_passwd = notebook_passwd or utils.generate_passwd(16)
    self.notebook_directory = notebook_directory
    self.hub_db_class = hub_db_class
    if n_engines_per_node is None:
        self.n_engines_per_node = None
    else:
        self.n_engines_per_node = int(n_engines_per_node)
    if n_engines_master is None:
        self.n_engines_master = None
    else:
        self.n_engines_master = int(n_engines_master)
    self.log_level = log_level
    # only known IPython serialization packers are accepted
    if packer not in (None, 'json', 'pickle', 'msgpack'):
        log.error("Unsupported packer: %s", packer)
        self.packer = None
    else:
        self.packer = packer
def dependency_instalation(self, node):
    """Run the PVFS dependency installation script on node via ssh.

    Script output is redirected to log/pvfs-dependency.log under the
    remote module path; exits the process on any failure.
    """
    try:
        log.info('PabPlug: ' + node.alias + ' Dependency instalation')
        script = self.__remote_module_path + '/bin/pvfs-dependency.sh'
        logfile = self.__remote_module_path + '/log/pvfs-dependency.log'
        node.ssh.execute(script + ' 2>&1 > ' + logfile)
    except Exception as e:
        msg = ('PabPlug: ' + node.alias + ' Dependency instalation error: ' +
               str(e))
        log.error(msg)
        sys.exit(msg)
def create(self, volume_size, volume_zone, name=None, tags=None):
    """Create, attach, format and tag a new EBS volume.

    volume_size - size in GB
    volume_zone - availability zone to create the volume in
    name - optional value for the volume's 'Name' tag
    tags - optional dict of tag-name -> tag-value pairs

    On any failure the partially created volume (if any) is detached
    and deleted before the exception is re-raised.
    NOTE(review): this method mixes 'self.log' and module-level 'log' --
    confirm both are valid in the enclosing class.
    """
    try:
        self.validate(volume_size, volume_zone, self._device)
        instance = self._request_instance(volume_zone)
        # mkfs must exist on the host before we bother creating anything
        self._validate_required_progs([self._mkfs_cmd.split()[0]])
        self._determine_device()
        vol = self._create_volume(volume_size, volume_zone)
        if tags:
            for tag in tags:
                tagval = tags.get(tag)
                tagmsg = "Adding volume tag: %s" % tag
                if tagval:
                    tagmsg += "=%s" % tagval
                log.info(tagmsg)
                vol.add_tag(tag, tagval)
        if name:
            vol.add_tag("Name", name)
        self._attach_volume(self._volume, instance.id, self._device)
        self._format_volume()
        self.shutdown()
        self._warn_about_volume_hosts()
        self.log.info("Your new %sGB volume %s has been created "
                      "successfully" % (volume_size, vol.id))
        return vol
    except Exception:
        self.log.error("failed to create new volume")
        if self._volume:
            # roll back: detach, wait briefly, then delete the new volume
            log.error(
                "Error occured. Detaching, and deleting volume: %s" %
                self._volume.id)
            self._volume.detach(force=True)
            time.sleep(5)
            self._volume.delete()
        self._warn_about_volume_hosts()
        raise
def _validate(self, validate_running=False):
    """
    Checks that all cluster template settings are valid. Raises a
    ClusterValidationError exception if not. Passing
    validate_running=True will also check that the existing instances
    properties match the configuration of this cluster template.
    """
    log.info("Validating cluster template settings...")
    self._has_all_required_settings()
    self._validate_spot_bid()
    self._validate_cluster_size()
    self._validate_shell_setting()
    self._validate_permission_settings()
    self._validate_credentials()
    self._validate_keypair()
    self._validate_zone()
    self._validate_ebs_settings()
    self._validate_ebs_aws_settings()
    self._validate_image_settings()
    self._validate_instance_types()
    if validate_running:
        log.info("Validating existing instances...")
        try:
            self._validate_running_instances()
        except exception.ClusterValidationError, e:
            # add context before letting the caller handle the error
            log.error("existing instances are not compatible with cluster" +
                      " template settings:")
            raise
def main(self):
    """Top-level CLI entry point: build parsers, dispatch a subcommand.

    Handles shell tab-completion via optcomplete when available and
    prints each line of a StarCluster BaseException message before
    exiting with status 1.
    """
    # Create global options parser.
    self.gparser = gparser = self.create_global_parser()
    # Declare subcommands.
    subcmds = all_cmds
    # subcommand completions
    scmap = {}
    for sc in subcmds:
        for n in sc.names:
            scmap[n] = sc
    if optcomplete:
        listcter = optcomplete.ListCompleter(scmap.keys())
        subcter = optcomplete.NoneCompleter()
        optcomplete.autocomplete(
            gparser, listcter, None, subcter, subcommands=scmap)
    elif 'COMP_LINE' in os.environ:
        # completion was requested but optcomplete is unavailable
        return -1
    gopts, sc, opts, args = self.parse_subcommands(gparser, subcmds)
    if args and args[0] == 'help':
        # make 'help' after a subcommand behave like --help
        sc.parser.print_help()
        sys.exit(0)
    try:
        sc.execute(args)
    except exception.BaseException, e:
        # log multi-line error messages one line at a time
        lines = e.msg.splitlines()
        for l in lines:
            log.error(l)
        sys.exit(1)
def get_value(self, value, node):
    """
    Handle variables

    [[date]]
    [[alias]] - node name
    [[master]] - name of this nodes master
    [[localuser]] - user name of person that started cluster,
    according to machine cluster started from
    """
    auto_pattern = r"\[\[(.+)\]\]"
    auto_v = re.match(auto_pattern, value)
    if auto_v:
        command = auto_v.group(1).strip()
        if command == "date":
            return datetime.utcnow().strftime("%c UTC")
        if command == "alias":
            return node.alias
        if command == "master":
            # NOTE(review): 'master' is not defined in this scope -- this
            # branch raises NameError unless a module-level 'master' exists.
            # A sibling implementation of this method comments this branch
            # out entirely; confirm intent.
            return master.alias
        if command == "localuser":
            return getpass.getuser()
        # patterned tag with no recognized command: warn, then use the
        # extracted text as the tag value
        log.error(
            ("Tagging: <%s> appears to be a patterned tag, but "
             "no command found. Tagging as <%s>.") % (value, command)
        )
        return command
    else:
        return value
def get_spot_history(self, instance_type, start=None, end=None, plot=False):
    """Fetch and summarize EC2 spot price history for instance_type.

    start/end must be ISO-8601 timestamps (validated up front). Logs the
    current, max and average price; optionally plots price vs date with
    pylab. Raises SpotHistoryError when no history is returned.
    """
    if not utils.is_iso_time(start):
        raise exception.InvalidIsoDate(start)
    if not utils.is_iso_time(end):
        raise exception.InvalidIsoDate(end)
    hist = self.conn.get_spot_price_history(start_time=start,
                                            end_time=end,
                                            instance_type=instance_type,
                                            product_description="Linux/UNIX")
    if not hist:
        raise exception.SpotHistoryError(start, end)
    dates = [utils.iso_to_datetime_tuple(i.timestamp) for i in hist]
    prices = [i.price for i in hist]
    maximum = max(prices)
    avg = sum(prices) / len(prices)
    log.info("Current price: $%.2f" % hist[-1].price)
    log.info("Max price: $%.2f" % maximum)
    log.info("Average price: $%.2f" % avg)
    if plot:
        try:
            # pylab is optional; only needed when plotting was requested
            import pylab
            pylab.plot_date(pylab.date2num(dates), prices, linestyle='-')
            pylab.xlabel('date')
            pylab.ylabel('price (cents)')
            pylab.title('%s Price vs Date (%s - %s)' %
                        (instance_type, start, end))
            pylab.grid(True)
            pylab.show()
        except ImportError, e:
            log.error("Error importing pylab:")
            log.error(str(e))
            log.error("please check that matplotlib is installed and that:")
            log.error("   $ python -c 'import pylab'")
            log.error("completes without error")
def list_bucket(self, bucketname): bucket = self.get_bucket_or_none(bucketname) if bucket: for file in bucket.list(): if file.name: print file.name else: log.error('bucket %s does not exist' % bucketname)
def execute(self, command, silent=True, only_printable=False,
            ignore_exit_status=False, log_output=True, detach=False,
            source_profile=True, raise_on_failure=True):
    """
    Execute a remote command and return stdout/stderr

    NOTE: this function blocks until the process finishes

    kwargs:
    silent - don't print the command's output to the console
    only_printable - filter the command's output to allow only printable
                     characters
    ignore_exit_status - don't warn about non-zero exit status
    log_output - log all remote output to the debug file
    detach - detach the remote process so that it continues to run even
             after the SSH connection closes (does NOT return output or
             check for non-zero exit status if detach=True)
    source_profile - if True prefix the command with "source /etc/profile"
    raise_on_failure - raise exception.SSHError if command fails

    returns List of output lines
    """
    channel = self.transport.open_session()
    if detach:
        # run under nohup and return immediately; no output is collected
        command = "nohup %s &" % command
        if source_profile:
            command = "source /etc/profile && %s" % command
        channel.exec_command(command)
        channel.close()
        self.__last_status = None
        return
    if source_profile:
        command = "source /etc/profile && %s" % command
    log.debug("executing remote command: %s" % command)
    channel.exec_command(command)
    output = self._get_output(channel, silent=silent,
                              only_printable=only_printable)
    # recv_exit_status blocks until the remote process terminates
    exit_status = channel.recv_exit_status()
    self.__last_status = exit_status
    out_str = '\n'.join(output)
    if exit_status != 0:
        msg = "remote command '%s' failed with status %d"
        msg %= (command, exit_status)
        if log_output:
            msg += ":\n%s" % out_str
        else:
            msg += " (no output log requested)"
        if not ignore_exit_status:
            if raise_on_failure:
                raise exception.RemoteCommandFailed(
                    msg, command, exit_status, out_str)
            else:
                log.error(msg)
        else:
            log.debug("(ignored) " + msg)
    else:
        if log_output:
            log.debug("output of '%s':\n%s" % (command, out_str))
        else:
            log.debug("output of '%s' has been hidden" % command)
    return output
def get_value(self, value, node):
    """
    Handle special values

    [[date]]
    [[alias]] - node name
    [[localuser]] - user name of person that started cluster,
    according to machine cluster started from
    """
    match = re.match(r'\[\[(.+)\]\]', value)
    if not match:
        # plain value: use it verbatim
        return value
    special_value = match.group(1).strip()
    if special_value == 'date':
        return datetime.utcnow().strftime('%c UTC')
    if special_value == 'alias':
        return node.alias
    # if special_value == 'master':
    #     return master.alias
    if special_value == 'localuser':
        return getpass.getuser()
    # looked like a [[...]] tag but no recognized keyword: warn and tag
    # with the extracted text as-is
    log.error(("Tagging: <%s> appears to be a patterned tag, but "
               "no special_value found. Tagging as <%s>.") %
              (value, special_value))
    return special_value
def is_valid(self, size, zone, device):
    """Return True if (size, zone, device) passes validate(), else False.

    Validation failures are logged rather than propagated.
    """
    try:
        self.validate(size, zone, device)
        return True
    except exception.BaseException, e:
        log.error(e.msg)
        return False
def _completer(self):
    """Build an optcomplete ListCompleter of registered AMI ids.

    Returns None (implicitly) if fetching the image list fails; the
    failure is logged.
    """
    try:
        rimages = self.ec2.registered_images
        completion_list = [i.id for i in rimages]
        return optcomplete.ListCompleter(completion_list)
    except Exception, e:
        log.error('something went wrong fix me: %s' % e)
def is_valid(self, size, zone, device, image):
    """Return True if (size, zone, device, image) passes validate().

    ValidationError is logged and converted to a False return.
    """
    try:
        self.validate(size, zone, device, image)
        return True
    except exception.ValidationError, e:
        log.error(e.msg)
        return False
def _eval_remove_node(self):
    """
    This function uses the sge stats to decide whether or not to
    remove a node from the cluster.
    """
    qlen = len(self.stat.get_queued_jobs())
    if qlen != 0:
        # jobs are still waiting; never shrink while there is a backlog
        return
    if not self.has_cluster_stabilized():
        return
    num_nodes = len(self._cluster.nodes)
    if num_nodes <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)" %
                 self.min_nodes)
        return
    # never remove so many that we drop below min_nodes
    max_remove = num_nodes - self.min_nodes
    log.info("Looking for nodes to remove...")
    remove_nodes = self._find_nodes_for_removal(max_remove=max_remove)
    if not remove_nodes:
        log.info("No nodes can be removed at this time")
    for node in remove_nodes:
        if node.update() != "running":
            log.error("Node %s is already dead - not removing" %
                      node.alias)
            continue
        log.warn("Removing %s: %s (%s)" %
                 (node.alias, node.id, node.dns_name))
        try:
            self._cluster.remove_node(node)
            # record the change so has_cluster_stabilized() can throttle
            self.__last_cluster_mod_time = utils.get_utc_now()
        except Exception:
            log.error("Failed to remove node %s" % node.alias,
                      exc_info=True)
def create(self, volume_size, volume_zone, name=None, tags=None):
    """Create, attach, format and tag a new EBS volume.

    volume_size - size in GB
    volume_zone - availability zone to create the volume in
    name - optional value for the volume's 'Name' tag
    tags - optional dict of tag-name -> tag-value pairs

    On failure the new volume is cleaned up via _delete_new_volume()
    and the exception is re-raised; _warn_about_volume_hosts() always
    runs regardless of outcome.
    """
    try:
        self.validate(volume_size, volume_zone, self._aws_block_device)
        instance = self._request_instance(volume_zone)
        # mkfs must exist on the host before we bother creating anything
        self._validate_required_progs([self._mkfs_cmd.split()[0]])
        self._determine_device()
        vol = self._create_volume(volume_size, volume_zone)
        if tags:
            for tag in tags:
                tagval = tags.get(tag)
                tagmsg = "Adding volume tag: %s" % tag
                if tagval:
                    tagmsg += "=%s" % tagval
                log.info(tagmsg)
                vol.add_tag(tag, tagval)
        if name:
            vol.add_tag("Name", name)
        self._attach_volume(self._volume, instance.id,
                            self._aws_block_device)
        self._get_volume_device(self._aws_block_device)
        self._format_volume()
        self.shutdown()
        log.info("Your new %sGB volume %s has been created successfully" %
                 (volume_size, vol.id))
        return vol
    except Exception:
        log.error("Failed to create new volume", exc_info=True)
        self._delete_new_volume()
        raise
    finally:
        self._warn_about_volume_hosts()
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return
    total_slots = self.stat.count_total_slots()
    # throttle growth while the cluster is still settling, unless there
    # are no slots at all yet
    if not self.has_cluster_stabilized() and total_slots > 0:
        return
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        # only react once a job has waited longer than the allowed max
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    # cap the addition by the per-iteration limit and the cluster maximum
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add > 0:
        log.warn("Adding %d nodes at %s" %
                 (need_to_add, str(utils.get_utc_now())))
        try:
            self._cluster.add_nodes(need_to_add)
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        except Exception:
            log.error("Failed to add new host", exc_info=True)
def get_stats(self):
    """
    this function will ssh to the SGE master and get load & queue stats.
    it will feed these stats to SGEStats, which parses the XML. it will
    return two arrays: one of hosts, each host has a hash with its host
    information inside. The job array contains a hash for every job,
    containing statistics about the job name, priority, etc

    Returns -1 on ssh/stat-collection failure (callers check for this
    sentinel).
    """
    log.debug("starting get_stats")
    master = self._cluster.master_node
    self.stat = SGEStats()
    qhostXml = ""
    qstatXml = ""
    qacct = ""
    try:
        now = self.get_remote_time()
        qatime = self.get_qatime(now)
        qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
        qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
        qhostXml = '\n'.join(master.ssh.execute(
            'source /etc/profile && qhost -xml', log_output=False))
        qstatXml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                log_output=False))
        # qacct may exit non-zero when there is no accounting data yet
        qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=False,
                                             ignore_exit_status=True))
    except Exception, e:
        log.error("Error occured getting SGE stats via ssh. "
                  "Cluster terminated?")
        log.error(e)
        return -1
def run(self, cluster):
    """
    This is a rough looping function. it will loop indefinitely, using
    SGELoadBalancer.get_stats() to get the clusters status. It will look
    at the job queue and try to decide whether to add or remove a node.
    It should later look at job durations. Doesn't yet.
    """
    self._cluster = cluster
    if not cluster.is_cluster_up():
        raise exception.ClusterNotRunning(cluster.cluster_tag)
    while(self._keep_polling):
        if not cluster.is_cluster_up():
            # cluster is mid-change (nodes added/removed): wait it out
            log.info("Entire cluster is not up, nodes added/removed. " +
                     "No Action.")
            time.sleep(self.polling_interval)
            continue
        # get_stats() returns -1 on ssh failure; treat it as fatal
        if self.get_stats() == -1:
            log.error("Failed to get stats. LoadBalancer is terminating.")
            return
        log.info("Oldest job is from %s. # queued jobs = %d. # hosts = %d."
                 % (self.stat.oldest_queued_job_age(),
                    len(self.stat.get_queued_jobs()),
                    len(self.stat.hosts)))
        log.info("Avg job duration = %d sec, Avg wait time = %d sec." %
                 (self.stat.avg_job_duration(), self.stat.avg_wait_time()))
        #evaluate if nodes need to be added
        self._eval_add_node()
        #evaluate if nodes need to be removed
        self._eval_remove_node()
        #call the visualizer
        self._call_visualizer()
        #sleep for the specified number of seconds
        log.info("Sleeping, looping again in %d seconds.\n" %
                 self.polling_interval)
        time.sleep(self.polling_interval)
def __fix_fetch(self, node):
    """Run the fetch-problem fixup script on node via ssh.

    Script output is redirected to log/fetch-problem.log under the
    remote module path. Exits the process on failure.
    """
    try:
        log.info('W2IO: ' + node.alias + ' Fixing fetch problem')
        node.ssh.execute(
            self.__remote_module_path + '/bin/fetch-problem.sh 2>&1 > ' +
            self.__remote_module_path + '/log/fetch-problem.log')
    except Exception as e:
        # was a bare 'except:', which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception and report the cause
        log.error('W2IO: ' + node.alias + ' Fixing fetch problem error: ' +
                  str(e))
        sys.exit('W2IO: ' + node.alias + ' Fixing fetch problem error: ' +
                 str(e))
def orangefs_instalation(self, node):
    """Run the OrangeFS installation script on node via ssh.

    Script output is redirected to log/pvfs-instalation.log under the
    remote module path; exits the process on any failure.
    """
    try:
        log.info('PabPlug: ' + node.alias + ' OrangeFS instalation')
        script = self.__remote_module_path + '/bin/pvfs-instalation.sh'
        logfile = self.__remote_module_path + '/log/pvfs-instalation.log'
        node.ssh.execute(script + ' 2>&1 > ' + logfile)
    except Exception as e:
        msg = ('PabPlug: ' + node.alias + ' OrangeFS instalation error: ' +
               str(e))
        log.error(msg)
        sys.exit(msg)
def bug_found(self):
    """Tell the user an internal error occurred, point them at the debug
    file, and exit with status 1."""
    for line in ("Oops! Looks like you've found a bug in StarCluster",
                 "Debug file written to: %s" % static.DEBUG_FILE,
                 "Look for lines starting with PID: %s" % static.PID,
                 "Please submit this file, minus any private information,",
                 "to [email protected]"):
        log.error(line)
    sys.exit(1)
def execute(self, args):
    """Start (or validate) the cluster named by the single <tag_name> arg.

    Resolves the cluster template (falling back to the configured
    default), applies command-line overrides, validates the settings
    and -- unless --validate-only or --no-create apply -- proceeds with
    creation. Spot instances require the enable_experimental global.
    """
    if len(args) != 1:
        self.parser.error("please specify a <tag_name> for this cluster")
    cfg = self.cfg
    use_experimental = cfg.globals.get('enable_experimental')
    if self.opts.spot_bid is not None and not use_experimental:
        raise exception.ExperimentalFeature('Using spot instances')
    tag = self.tag = args[0]
    template = self.opts.cluster_template
    if not template:
        template = cfg.get_default_cluster_template(tag)
        log.info("Using default cluster template: %s" % template)
    cluster_exists = cluster.cluster_exists(tag, cfg)
    create = not self.opts.no_create
    # --no-create only makes sense against an already-running cluster
    if not cluster_exists and not create:
        raise exception.ClusterDoesNotExist(tag)
    scluster = cfg.get_cluster_template(template, tag)
    scluster.update(self.specified_options_dict)
    validate_running = self.opts.no_create
    validate_only = self.opts.validate_only
    try:
        scluster._validate(validate_running=validate_running)
        if validate_only:
            return
    except exception.ClusterValidationError, e:
        log.error('settings for cluster template "%s" are not valid:' %
                  template)
        raise
def set_trace():
    """Report that the PuDB debugger could not be loaded and how to get it."""
    for msg in ("Unable to load PuDB",
                "Please check that PuDB is installed and working.",
                "If not, you can install it via: easy_install pudb"):
        log.error(msg)
def ipy_shell(local_ns=None):
    """Fallback shell: report that IPython could not be loaded.

    NOTE(review): 'e' is not defined in this function -- presumably this
    def is created inside an 'except ImportError, e:' block at module
    import time so 'e' resolves at the enclosing scope; confirm.
    """
    log.error("Unable to load IPython:\n\n%s\n" % e)
    log.error("Please check that IPython is installed and working.")
    log.error("If not, you can install it via: easy_install ipython")
def visualizer(self):
    """Lazily construct and cache the SGE stats visualizer.

    Requires matplotlib and numpy; an ImportError is logged with install
    hints and re-raised as a BaseException.
    """
    if not self._visualizer:
        try:
            from starcluster.balancers.sge import visualizer
        except ImportError, e:
            log.error("Error importing visualizer:")
            log.error(str(e))
            log.error("check that matplotlib and numpy are installed and:")
            log.error("   $ python -c 'import matplotlib'")
            log.error("   $ python -c 'import numpy'")
            log.error("completes without error")
            raise exception.BaseException(
                "Failed to load stats visualizer")
        self._visualizer = visualizer.SGEVisualizer(
            self.stats_file, self.plot_output_dir)
def _eval_add_node(self):
    """
    This function inspects the current state of the SGE queue and decides
    whether or not to add nodes to the cluster. Returns the number of
    nodes to add.

    (In practice it returns True when an add was attempted and False
    otherwise.)
    """
    num_nodes = len(self._cluster.nodes)
    if num_nodes >= self.max_nodes:
        log.info("Not adding nodes: already at or above maximum (%d)" %
                 self.max_nodes)
        return False
    queued_jobs = self.stat.get_queued_jobs()
    if not queued_jobs and num_nodes >= self.min_nodes:
        log.info("Not adding nodes: at or above minimum nodes "
                 "and no queued jobs...")
        return False
    total_slots = self.stat.count_total_slots()
    # throttle growth while the cluster is still settling, unless there
    # are no slots at all yet
    if not self.has_cluster_stabilized() and total_slots > 0:
        return False
    running_jobs = self.stat.get_running_jobs()
    used_slots = sum([int(j['slots']) for j in running_jobs])
    qw_slots = sum([int(j['slots']) for j in queued_jobs])
    slots_per_host = self.stat.slots_per_host()
    avail_slots = total_slots - used_slots
    need_to_add = 0
    if num_nodes < self.min_nodes:
        log.info("Adding node: below minimum (%d)" % self.min_nodes)
        need_to_add = self.min_nodes - num_nodes
    elif total_slots == 0:
        # no slots, add one now
        need_to_add = 1
    elif qw_slots > avail_slots:
        log.info("Queued jobs need more slots (%d) than available (%d)" %
                 (qw_slots, avail_slots))
        oldest_job_dt = self.stat.oldest_queued_job_age()
        now = self.get_remote_time()
        age_delta = now - oldest_job_dt
        # only react once a job has waited longer than the allowed max
        if age_delta.seconds > self.longest_allowed_queue_time:
            log.info("A job has been waiting for %d seconds "
                     "longer than max: %d" %
                     (age_delta.seconds, self.longest_allowed_queue_time))
            if slots_per_host != 0:
                need_to_add = qw_slots / slots_per_host
            else:
                need_to_add = 1
        else:
            log.info("No queued jobs older than %d seconds" %
                     self.longest_allowed_queue_time)
    # cap the addition by the per-iteration limit and the cluster maximum
    max_add = self.max_nodes - len(self._cluster.running_nodes)
    need_to_add = min(self.add_nodes_per_iteration, need_to_add, max_add)
    if need_to_add < 1:
        return False
    log.warn("Adding %d nodes at %s" %
             (need_to_add, str(utils.get_utc_now())))
    try:
        self._cluster.add_nodes(need_to_add,
                                reboot_interval=self.reboot_interval,
                                n_reboot_restart=self.n_reboot_restart,
                                placement_group=self._placement_group,
                                spot_bid=self._spot_bid,
                                instance_type=self._instance_type)
        # only record the mod time if the node count actually grew
        if num_nodes < len(self._cluster.nodes):
            self.__last_cluster_mod_time = utils.get_utc_now()
            log.info("Done adding nodes at %s" %
                     str(self.__last_cluster_mod_time))
        else:
            log.info("No nodes were successfully added.")
    except ThreadPoolException as tpe:
        traceback.print_exc()
        log.error("Failed to add new host", exc_info=True)
        log.debug(traceback.format_exc())
        # a thread pool failure aggregates several worker exceptions
        log.error("Individual errors follow")
        for exc in tpe.exceptions:
            print exc[1]
    except Exception:
        traceback.print_exc()
        log.error("Failed to add new host", exc_info=True)
        log.debug(traceback.format_exc())
    return True
def _check_ipython_installed(self, node):
    """Return whether node has both 'ipython' and 'ipcluster' available.

    Logs an error (and the plugin is expected to be skipped) when the
    requirement is not met.
    """
    required = ['ipython', 'ipcluster']
    installed = node.ssh.has_required(required)
    if not installed:
        log.error("IPython is not installed...skipping plugin")
    return installed
def execute(self, command, silent=True, only_printable=False,
            ignore_exit_status=False, log_output=True, detach=False,
            source_profile=True, raise_on_failure=True):
    """
    Execute a remote command and return stdout/stderr

    NOTE: this function blocks until the process finishes

    kwargs:
    silent - don't print the command's output to the console
    only_printable - filter the command's output to allow only printable
                     characters
    ignore_exit_status - don't warn about non-zero exit status
    log_output - log all remote output to the debug file
    detach - detach the remote process so that it continues to run even
             after the SSH connection closes (does NOT return output or
             check for non-zero exit status if detach=True)
    source_profile - if True prefix the command with "source /etc/profile"
    raise_on_failure - raise exception.SSHError if command fails

    returns List of output lines
    """
    channel = self.transport.open_session()
    if detach:
        # run under nohup and return immediately; no output is collected
        command = "nohup %s &" % command
        if source_profile:
            command = "source /etc/profile && %s" % command
        channel.exec_command(command)
        channel.close()
        self.__last_status = None
        return
    if source_profile:
        command = "source /etc/profile && %s" % command
    log.debug("executing remote command: %s" % command)
    channel.exec_command(command)
    output = self._get_output(channel, silent=silent,
                              only_printable=only_printable)
    # recv_exit_status blocks until the remote process terminates
    exit_status = channel.recv_exit_status()
    self.__last_status = exit_status
    out_str = '\n'.join(output)
    if exit_status != 0:
        msg = "remote command '%s' failed with status %d"
        msg %= (command, exit_status)
        if log_output:
            msg += ":\n%s" % out_str
        else:
            msg += " (no output log requested)"
        if not ignore_exit_status:
            if raise_on_failure:
                raise exception.RemoteCommandFailed(
                    msg, command, exit_status, out_str)
            else:
                log.error(msg)
        else:
            log.debug("(ignored) " + msg)
    else:
        if log_output:
            try:
                log.debug("output of '%s':\n%s" % (command, out_str))
            except Exception:
                # was a bare 'except:' (would also swallow SystemExit /
                # KeyboardInterrupt); a logging failure should not kill
                # the command, but only ordinary exceptions are ignored
                log.debug("Error writing outstring")
        else:
            log.debug("output of '%s' has been hidden" % command)
    return output
def _completer(self):
    """Build an optcomplete ListCompleter of EBS volume ids.

    Returns None (implicitly) if fetching the volume list fails; the
    failure is logged.
    """
    try:
        completion_list = [v.id for v in self.ec2.get_volumes()]
        return optcomplete.ListCompleter(completion_list)
    except Exception, e:
        log.error('something went wrong fix me: %s' % e)
# Show StarCluster header self.print_header() # Parse subcommand options and args gopts, sc, opts, args = self.parse_subcommands() if args and args[0] == 'help': # make 'help' subcommand act like --help option sc.parser.print_help() sys.exit(0) # run the subcommand and handle exceptions try: sc.execute(args) except (EC2ResponseError, S3ResponseError, BotoServerError), e: log.error("%s: %s" % (e.error_code, e.error_message)) sys.exit(1) except socket.error, e: log.error("Unable to connect: %s" % e) log.error("Check your internet connection?") sys.exit(1) except exception.ThreadPoolException, e: if not gopts.DEBUG: e.print_excs() log.debug(e.format_excs()) print self.bug_found() except exception.ClusterDoesNotExist, e: cm = gopts.CONFIG.get_cluster_manager() cls = cm.get_clusters() log.error(e.msg) if cls: taglist = ', '.join([c.cluster_tag for c in cls]) active_clusters = "(active clusters: %s)" % taglist
# Parse subcommand options and args gopts, sc, opts, args = self.parse_subcommands() if args and args[0] == 'help': # make 'help' subcommand act like --help option sc.parser.print_help() sys.exit(0) # run the subcommand and handle exceptions try: sc.execute(args) except (EC2ResponseError, S3ResponseError, BotoServerError), e: log.error("%s: %s" % (e.error_code, e.error_message), exc_info=True) sys.exit(1) except socket.error, e: log.exception("Connection error:") log.error("Check your internet connection?") sys.exit(1) except exception.ThreadPoolException, e: log.error(e.format_excs()) self.bug_found() except exception.ClusterDoesNotExist, e: cm = gopts.CONFIG.get_cluster_manager() cls = '' try: cls = cm.get_clusters(load_plugins=False, load_receipt=False) except: log.debug("Error fetching cluster list", exc_info=True) log.error(e.msg) if cls: taglist = ', '.join([c.cluster_tag for c in cls]) active_clusters = "(active clusters: %s)" % taglist
missing_args = [] for arg in args: if arg in plugin: config_args.append(plugin.get(arg)) else: missing_args.append(arg) if debug: log.debug("config_args = %s" % config_args) if missing_args: raise exception.PluginError( "Not enough settings provided for plugin %s (missing: %s)" % (plugin_name, ', '.join(missing_args))) config_kwargs = {} for arg in kwargs: if arg in plugin: config_kwargs[arg] = plugin.get(arg) if debug: log.debug("config_kwargs = %s" % config_kwargs) try: plug_obj = klass(*config_args, **config_kwargs) except Exception as exc: log.error("Error occured:", exc_info=True) raise exception.PluginLoadError( "Failed to load plugin %s with " "the following error: %s - %s" % (setup_class, exc.__class__.__name__, exc.message)) if not hasattr(plug_obj, '__name__'): setattr(plug_obj, '__name__', plugin_name) plugs.append(plug_obj) return plugs
class CmdShell(CmdBase):
    """
    shell

    Load an interactive IPython shell configured for starcluster development

    The following objects are automatically available at the prompt:

        cfg - starcluster.config.StarClusterConfig instance
        cm - starcluster.cluster.ClusterManager instance
        ec2 - starcluster.awsutils.EasyEC2 instance
        s3 - starcluster.awsutils.EasyS3 instance

    All StarCluster modules are automatically imported in the IPython
    session along with all StarCluster dependencies (e.g. boto, ssh, etc.)

    If the --ipcluster=CLUSTER (-p) is passed, the IPython session will be
    automatically be configured to connect to the remote CLUSTER using
    IPython's parallel interface (requires IPython 0.11+). In this mode you
    will have the following additional objects available at the prompt:

        ipcluster - starcluster.cluster.Cluster instance for the cluster
        ipclient - IPython.parallel.Client instance for the cluster
        ipview - IPython.parallel.client.view.DirectView for the cluster

    Here's an example of how to run a parallel map across all nodes in the
    cluster:

        [~]> ipclient.ids
        [0, 1, 2, 3]
        [~]> res = ipview.map_async(lambda x: x**30, range(8))
        [~]> print res.get()
        [0, 1, 1073741824, 205891132094649L, 1152921504606846976L,
         931322574615478515625L, 221073919720733357899776L,
         22539340290692258087863249L]

    See IPython parallel docs for more details
    (http://ipython.org/ipython-doc/stable/parallel)
    """
    # NOTE: the docstring above doubles as the CLI help text; its 4th line
    # is extracted as the one-line command description, so its layout must
    # not change.
    names = ['shell', 'sh']

    def _add_to_known_hosts(self, node):
        # Append the node's host key to the local ~/.ssh/known_hosts file
        # (if the file exists and the node is not already listed) so that
        # subsequent ssh connections to the node do not prompt the user.
        log.info("Configuring local known_hosts file")
        user_home = os.path.expanduser('~')
        khosts = os.path.join(user_home, '.ssh', 'known_hosts')
        if not os.path.isfile(khosts):
            # best-effort only: never create the file ourselves
            log.warn("Unable to configure known_hosts: file does not exist")
            return
        contents = open(khosts).read()
        if node.dns_name not in contents:
            server_pkey = node.ssh.get_server_public_key()
            khostsf = open(khosts, 'a')
            # make sure the new entry starts on its own line
            if contents[-1] != '\n':
                khostsf.write('\n')
            # entry covers both the public DNS name and the IP address
            name_entry = '%s,%s' % (node.dns_name, node.ip_address)
            khostsf.write(' '.join([name_entry, server_pkey.get_name(),
                                    base64.b64encode(str(server_pkey)),
                                    '\n']))
            khostsf.close()

    def addopts(self, parser):
        # -p/--ipcluster selects a running cluster to attach an IPython
        # parallel session to (see class docstring).
        parser.add_option("-p", "--ipcluster", dest="ipcluster",
                          action="store", type="string", default=None,
                          metavar="CLUSTER", help="configure a parallel "
                          "IPython session on CLUSTER")

    def execute(self, args):
        # Names pre-loaded into the interactive shell's namespace.
        local_ns = dict(cfg=self.cfg, ec2=self.ec2, s3=self.s3, cm=self.cm,
                        starcluster=starcluster, log=log)
        if self.opts.ipcluster:
            log.info("Loading parallel IPython library")
            try:
                from IPython.parallel import Client
            except ImportError, e:
                self.parser.error(
                    "Error loading parallel IPython:"
                    "\n\n%s\n\n"
                    "NOTE: IPython 0.11+ must be installed to use -p" % e)
            tag = self.opts.ipcluster
            cl = self.cm.get_cluster(tag)
            region = cl.master_node.region.name
            # Cached copy of the IPython connector file, keyed by
            # cluster tag and region.
            ipcluster_dir = os.path.join(static.STARCLUSTER_CFG_DIR,
                                         'ipcluster')
            local_json = os.path.join(ipcluster_dir,
                                      "%s-%s.json" % (tag, region))
            if not os.path.exists(local_json):
                # Fetch the connector file from the cluster user's IPython
                # profile on the master node.
                user_home = cl.master_node.getpwnam(cl.cluster_user).pw_dir
                # NOTE(review): local name 'json' shadows the stdlib json
                # module if it is imported at file level
                profile_dir = posixpath.join(user_home, '.ipython',
                                             'profile_default')
                json = posixpath.join(profile_dir, 'security',
                                      'ipcontroller-client.json')
                if cl.master_node.ssh.isfile(json):
                    log.info("Fetching connector file from cluster...")
                    if not os.path.exists(ipcluster_dir):
                        os.makedirs(ipcluster_dir)
                    cl.master_node.ssh.get(json, local_json)
                else:
                    # NOTE(review): this format string contains a %s but no
                    # argument is supplied, so the literal "%s" appears in
                    # the error message; likely should be "... % json"
                    self.parser.error(
                        "IPython json file %s does not exist locally or on "
                        "the cluster. Make sure the ipcluster plugin has "
                        "been executed and completed successfully.")
            key_location = cl.master_node.key_location
            self._add_to_known_hosts(cl.master_node)
            log.info("Loading parallel IPython client and view")
            rc = Client(local_json, sshkey=key_location)
            local_ns['Client'] = Client
            local_ns['ipcluster'] = cl
            local_ns['ipclient'] = rc
            # DirectView over all engines in the cluster
            local_ns['ipview'] = rc[:]
        # Import every StarCluster submodule plus key dependencies into the
        # shell namespace; failures are logged but non-fatal.
        modules = [(starcluster.__name__ + '.'
                    + module, module) for module in starcluster.__all__]
        modules += [('boto', 'boto'), ('paramiko', 'paramiko'),
                    ('workerpool', 'workerpool'), ('jinja2', 'jinja2'),
                    ('Crypto', 'Crypto'), ('iptools', 'iptools')]
        for fullname, modname in modules:
            log.info('Importing module %s' % modname)
            try:
                __import__(fullname)
                local_ns[modname] = sys.modules[fullname]
            except ImportError, e:
                log.error("Error loading module %s: %s" % (modname, e))
def parse_subcommands(self, gparser, subcmds):
    """
    Parse given global arguments, find subcommand from given list of
    subcommand objects, parse local arguments and return a tuple of
    global options, selected command object, command options, and command
    arguments.  Call execute() on the command object to run.  The command
    object has members 'gopts' and 'opts' set for global and command
    options respectively, you don't need to call execute with those but
    you could if you wanted to.
    """
    print self.get_description()
    # Build map of name -> command and docstring.
    cmds_header = 'Available Commands:'
    gparser.usage += '\n\n%s\n' % cmds_header
    gparser.usage += '%s\n' % ('-' * len(cmds_header))
    gparser.usage += "NOTE: Pass --help to any command for a list of its "
    gparser.usage += 'options and detailed usage information\n\n'
    for sc in subcmds:
        # one-line description comes from the 4th line of each command's
        # docstring (so command docstrings must keep that layout)
        helptxt = sc.__doc__.splitlines()[3].strip()
        gparser.usage += '- %s: %s\n' % (', '.join(sc.names), helptxt)
        for n in sc.names:
            # NOTE(review): duplicate-name check uses assert, which is
            # stripped when running under python -O
            assert n not in self.subcmds_map
            self.subcmds_map[n] = sc
    # Declare and parse global options.
    gparser.disable_interspersed_args()
    gopts, args = gparser.parse_args()
    if not args:
        gparser.print_help()
        raise SystemExit("\nError: you must specify an action.")
    # first positional arg selects the subcommand; the rest go to it
    subcmdname, subargs = args[0], args[1:]
    #### CHANGED
    """ CHANGE LOG FILE TO USER-SUPPLIED LOCATION IF PROVIDED """
    if gopts.LOGFILE:
        static.DEBUG_FILE = gopts.LOGFILE
        #### REM: THIS REMOVES starcluster.logger.ConsoleLogger HANDLER
        # drop all existing handlers, then re-run logging setup so the
        # file handler points at the user-supplied log file
        while len(log.handlers) > 0:
            log.removeHandler(log.handlers[0])
        logger.configure_sc_logging()
    # set debug level if specified
    if gopts.DEBUG:
        console.setLevel(logger.DEBUG)
    #### CHANGED
    """ ADDED config_file TO utils FOR USE BY commands.start.addopts """
    utils.config_file = gopts.CONFIG
    # load StarClusterConfig into global options
    try:
        cfg = config.StarClusterConfig(gopts.CONFIG)
        cfg.load()
    except exception.ConfigNotFound, e:
        log.error(e.msg)
        e.display_options()
        sys.exit(1)
def _setup_ebs_volumes(self):
    """
    Mount EBS volumes, if specified in ~/.starcluster/config to /home
    """
    # setup /etc/fstab on master to use block device if specified
    master = self._master
    devs = master.ssh.ls('/dev')
    for vol in self._volumes:
        vol = self._volumes[vol]
        vol_id = vol.get("volume_id")
        mount_path = vol.get('mount_path')
        device = vol.get("device")
        volume_partition = vol.get('partition')
        # volume_id, device and mount_path are all required; skip the
        # volume (but keep processing the others) if any is missing
        if not (vol_id and device and mount_path):
            log.error("missing required settings for vol %s" % vol)
            continue
        dev_exists = master.ssh.path_exists(device)
        if not dev_exists and device.startswith('/dev/sd'):
            # check for "correct" device in unpatched kernels
            device = device.replace('/dev/sd', '/dev/xvd')
            dev_exists = master.ssh.path_exists(device)
        if not dev_exists:
            log.warn("Cannot find device %s for volume %s" %
                     (device, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This usually means there was a problem "
                     "attaching the EBS volume to the master node")
            continue
        if not volume_partition:
            # No partition configured: infer it from the /dev listing.
            # NOTE(review): assumes entries in `devs` are full paths so
            # the device itself matches startswith — 1 match means an
            # unpartitioned volume, 2 means device + single partition
            partitions = filter(lambda x: x.startswith(device), devs)
            if len(partitions) == 1:
                volume_partition = device
            elif len(partitions) == 2:
                volume_partition = device + '1'
            else:
                # NOTE(review): message suggests partition=0/1 while the
                # code above appends '1' to the device name — confirm the
                # expected config value format
                log.error(
                    "volume has more than one partition, please specify "
                    "which partition to use (e.g. partition=0, "
                    "partition=1, etc.) in the volume's config")
                continue
        elif not master.ssh.path_exists(volume_partition):
            log.warn("Cannot find partition %s on volume %s" %
                     (volume_partition, vol_id))
            log.warn("Not mounting %s on %s" % (vol_id, mount_path))
            log.warn("This either means that the volume has not "
                     "been partitioned or that the partition"
                     "specified does not exist on the volume")
            continue
        log.info("Mounting EBS volume %s on %s..."
                 % (vol_id, mount_path))
        # Skip (or warn about) partitions that are already mounted.
        mount_map = self._master.get_mount_map()
        dev = mount_map.get(volume_partition)
        if dev:
            path, fstype, options = dev
            if path != mount_path:
                log.error("Volume %s is mounted on %s, not on %s" %
                          (vol_id, path, mount_path))
            else:
                log.info("Volume %s already mounted on %s...skipping" %
                         (vol_id, mount_path))
            continue
        self._master.mount_device(volume_partition, mount_path)
def execute(self, args):
    """
    Start the cluster named by the single <cluster_tag> argument.

    Either launches a new cluster from a template or (with --no-create)
    re-validates and starts an existing one.  Raises ClusterExists when
    asked to create over a live cluster and ClusterDoesNotExist when asked
    to start a cluster that is not running.
    """
    if len(args) != 1:
        self.parser.error("please specify a <cluster_tag>")
    tag = self.tag = args[0]
    # --no-create (-x) means: operate only on an already-running cluster
    do_create = not self.opts.no_create
    only_create = self.opts.create_only
    existing = self.cm.get_cluster_or_none(tag)
    should_validate = self.opts.validate
    check_running = self.opts.no_create
    only_validate = self.opts.validate_only
    if existing and do_create:
        # Refuse to create over a cluster that already exists; report
        # whether it is an EBS-backed cluster that is merely stopped.
        stopped_ebs = existing.is_cluster_stopped()
        ebs_backed = False
        if not stopped_ebs:
            ebs_backed = existing.is_ebs_cluster()
        raise exception.ClusterExists(tag, is_ebs=ebs_backed,
                                      stopped_ebs=stopped_ebs)
    if not (existing or do_create):
        raise exception.ClusterDoesNotExist(tag)
    if existing:
        # Reuse the template the cluster was originally launched with and
        # validate against the running cluster.
        check_running = True
        scluster = self.cm.get_cluster(tag)
        log.info(
            "Using original template used to launch cluster '%s'" %
            scluster.cluster_tag)
    else:
        template = self.opts.cluster_template
        if not template:
            template = self.cm.get_default_cluster_template()
            log.info("Using default cluster template: %s" % template)
        scluster = self.cm.get_cluster_template(template, tag)
    # Command-line options override the template's settings.
    scluster.update(self.specified_options_dict)
    if not self.opts.refresh_interval:
        scluster.refresh_interval = self.cfg.globals.get("refresh_interval")
    if not should_validate:
        log.warn("SKIPPING VALIDATION - USE AT YOUR OWN RISK")
    else:
        try:
            scluster._validate(validate_running=check_running)
        except exception.ClusterValidationError:
            if not existing:
                log.error(
                    'settings for cluster template "%s" are not valid:' %
                    template)
            raise
    if only_validate:
        return
    if self.opts.spot_bid is not None and not self.opts.no_create:
        # Build the equivalent command line with -x appended so the user
        # can resume monitoring the spot cluster without re-creating it.
        resume_cmd = ' '.join(sys.argv[1:])
        resume_cmd = resume_cmd.replace('--no-create', '').replace('-x', '')
        resume_cmd += ' -x'
        msg = user_msgs.spotmsg % {'cmd': resume_cmd,
                                   'size': scluster.cluster_size,
                                   'tag': tag}
        self.warn_experimental(msg, num_secs=5)
    self.catch_ctrl_c()
    scluster.start(create=do_create, create_only=only_create,
                   validate=False)
    if self.opts.login_master:
        scluster.ssh_to_master()
def _call_visualizer(self):
    # Invoke the load-balancer stats visualizer if it is enabled.
    # The import is done lazily so the balancer works without
    # matplotlib/numpy installed.
    if not self._visualizer_on:
        return
    try:
        from starcluster.balancers.sge import visualizer
    except ImportError, e:
        # visualizer (indirectly) requires matplotlib and numpy; if the
        # import fails, explain how to diagnose and carry on without it
        log.error("Error importing matplotlib and numpy:")
        log.error(str(e))
        log.error("check that matplotlib and numpy are installed and:")
        log.error(" $ python -c 'import matplotlib'")
        log.error(" $ python -c 'import numpy'")
        log.error("completes without error")
        log.error("Visualizer has been disabled.")
        # turn the visualizer off, but keep going.
        self._visualizer_on = False
        return
def clean_cluster(self, nodes, master, user, user_shell, volumes):
    """
    Run qhost to find nodes that are present in OGS but not in the
    cluster in order to remove them.

    For each dead host this also force-deletes its running jobs, clears
    jobs stuck in the Eqw state, works around a stuck-qrsh condition and
    finally removes the host from the SGE configuration.
    """
    self._master = master
    self._nodes = nodes
    qhost_xml = master.ssh.execute("qhost -xml", source_profile=True)
    qhost_et = ET.fromstringlist(qhost_xml)
    # hosts known to the scheduler, excluding the pseudo-host "global"
    qhosts = []
    for host in qhost_et:
        h_name = host.attrib['name']
        if h_name != 'global':
            qhosts.append(h_name)
    if len(qhosts) == 0:
        # NOTE: deliberately falls through — the stuck-qrsh check below
        # still runs even when there are no scheduler hosts
        log.info("Nothing to clean")
    alive_nodes = [node.alias for node in nodes]
    cleaned = []
    # find dead hosts
    for node_alias in qhosts:
        if node_alias not in alive_nodes:
            cleaned.append(node_alias)
    # find jobs running in dead hosts
    qstats_xml = self._master.ssh.execute("qstat -u \"*\" -xml",
                                          source_profile=True)
    # remove first line (the XML declaration); BUGFIX: the original code
    # evaluated qstats_xml[1:] without assigning it, which was a no-op
    qstats_xml = qstats_xml[1:]
    qstats_et = ET.fromstringlist(qstats_xml)
    to_delete = []
    to_repair = []
    cleaned_queue = []
    # not a lambda function to allow pickling
    for c in cleaned:
        cleaned_queue.append("all.q@" + c)
    # jobs scheduled on queues that live on dead hosts must be deleted
    for job_list in qstats_et.find("queue_info").findall("job_list"):
        if job_list.find("queue_name").text in cleaned_queue:
            job_number = job_list.find("JB_job_number").text
            to_delete.append(job_number)
    # jobs in the error-wait state just need to be cleared (qmod -cj)
    for job_list in qstats_et.find("job_info").findall("job_list"):
        if job_list.find("state").text == "Eqw":
            job_number = job_list.find("JB_job_number").text
            to_repair.append(job_number)
    # delete the jobs
    if to_delete:
        log.info("Stopping jobs: " + str(to_delete))
        self._master.ssh.execute("qdel -f " + " ".join(to_delete))
        time.sleep(3)  # otherwise might provoke LOST QRSH if on last job
    if to_repair:
        log.error("Reseting jobs: " + str(to_repair))
        self._master.ssh.execute("qmod -cj " + " ".join(to_repair),
                                 ignore_exit_status=True)
    # stuck qrsh issue: qrsh processes lingering with no jobs in qstat
    ps_wc = int(self._master.ssh.execute("ps -ef | grep qrsh | wc -l")[0])
    qstat_wc = int(self._master.ssh.execute("qstat -u \"*\" | wc -l")[0])
    if qstat_wc == 0 and ps_wc > 2:
        log.error("LOST QRSH??")
        log.error("pkill -9 qrsh")
        self._master.ssh.execute("pkill -9 qrsh",
                                 ignore_exit_status=True)
    # ----------------------------------
    # delete the host config
    for c in cleaned:
        log.info("Cleaning node " + c)
        # _remove_from_sge needs the host resolvable; add a dummy
        # /etc/hosts entry if the dead node is no longer listed there
        if len(master.ssh.get_remote_file_lines("/etc/hosts", c)) == 0:
            log.warn(c + " is missing from /etc/hosts, creating a dummy "
                     "entry 1.1.1.1")
            rfile = master.ssh.remote_file("/etc/hosts", 'a')
            rfile.write("1.1.1.1 " + c + "\n")
            rfile.close()
        try:
            self._remove_from_sge(DeadNode(c), only_clean_master=True)
        except RemoteCommandFailed:
            log.warning("Failed to remove node {} from sge."
                        .format(c), exc_info=True)
    # drop references to unpicklable ssh-backed objects (fix to allow
    # pickling)
    self._master = None
    self._nodes = None