Exemplo n.º 1
0
    def run(self, nodes, master, user, user_shell, volumes):
        """
        Configure and (re)start torque on the cluster.

        The master receives the server/scheduler configuration. Every entry
        of ``nodes`` after the first receives the MOM configuration and is
        added to the master. ``user``, ``user_shell`` and ``volumes`` are
        accepted but not used here.
        """
        # -- torque server and scheduler configuration (master)
        log.info("Configuring torque server...")
        master.ssh.execute(master_configure_server)

        # every node except the first one is handled as a torque client
        workers = nodes[1:]

        # -- torque client configuration, then registration on the master
        for worker in workers:
            log.info("Configuring torque node '%s'..." % worker.alias)
            worker.ssh.execute(node_configure_mom)
            self._add_torque_node_to_master(worker, master)

        # -- (re)start services: server first, node daemons, scheduler last
        log.info("Starting torque services...")
        self._force_deamon_restart(master, 'pbs_server')
        for worker in workers:
            self._start_torque_node_daemon(worker)
        self._force_deamon_restart(master, 'pbs_sched')

        # -- run the server queries on the master
        log.debug("Torque server information:")
        for qmgr_query in ("qmgr -c 'l s'", "qmgr -c 'p s'"):
            master.ssh.execute(qmgr_query)

        # -- query every client from the master, then list all nodes
        log.debug("Torque nodes information:")
        for worker in workers:
            master.ssh.execute('momctl -h %s -d 2' % worker.alias)
        master.ssh.execute("qnodes")
Exemplo n.º 2
0
 def _find_node_for_removal(self):
     """
     Return the list of running nodes that may be removed right now.

     A node is selected when all of the following hold:
     1. It is not the master, unless allow_master_kill is set
     2. self.stat reports that it is not working
     3. Its uptime modulo 60 minutes is at least self.kill_after

     When polling_interval exceeds 300, kill_after is first recomputed as
     max(45, 60 - (2 * polling_interval / 60)).
     """
     # The threshold depends on polling_interval only, so it is computed
     # once up front instead of once per node inside the loop.
     if self.polling_interval > 300:
         self.kill_after = \
             max(45, 60 - (2 * self.polling_interval / 60))
     to_rem = []
     for node in self._cluster.running_nodes:
         if not self.allow_master_kill and \
                 node.id == self._cluster.master_node.id:
             log.debug("not removing master node")
             continue
         is_working = self.stat.is_node_working(node)
         mins_up = self._minutes_uptime(node) % 60
         if is_working:
             continue
         log.info("Idle Node %s (%s) has been up for %d minutes "
                  "past the hour."
                  % (node.id, node.alias, mins_up))
         if mins_up >= self.kill_after:
             to_rem.append(node)
     return to_rem
Exemplo n.º 3
0
Arquivo: sge.py Projeto: agua/aguadev
 def settingsCommand(self):
     """
     Build the shell command that runs create_settings.sh for this cell.

     The command changes into <root>/<cell>/common, appends the text
     returned by exportEnvironmentVars() and finally invokes
     <root>/util/create_settings.sh with that same directory as argument.
     """
     common_dir = "/".join([self.root, self.cell, "common"])
     pieces = [
         'cd ', common_dir, '; ',
         self.exportEnvironmentVars(),
         self.root, '/util/create_settings.sh ', common_dir,
     ]
     cmd = "".join(pieces)
     log.debug("sge.CreateCell.createSettings    cmd: %s", cmd)
     return cmd
Exemplo n.º 4
0
 def load(self):
     """
     Fill in this config object section by section and return it.

     A missing [global] section is silently tolerated and a missing
     [aws info] section only produces a warning. Settings returned by
     get_settings_from_env() are merged on top of self.aws. Volumes are
     read from both 'volume' and 'vol' sections.
     """
     log.debug('Loading config')
     try:
         global_section = self._load_section('global', self.global_settings)
     except exception.ConfigSectionMissing:
         # nothing to do when there is no [global] section
         pass
     else:
         self.globals = global_section
     try:
         aws_section = self._load_section('aws info', self.aws_settings)
     except exception.ConfigSectionMissing:
         log.warn("No [aws info] section found in the config!")
     else:
         self.aws = aws_section
     self.aws.update(self.get_settings_from_env(self.aws_settings))
     self.keys = self._load_sections('key', self.key_settings)
     self.vols = self._load_sections('volume', self.volume_settings)
     short_form_vols = self._load_sections('vol', self.volume_settings)
     self.vols.update(short_form_vols)
     self.plugins = self._load_sections(
         'plugin', self.plugin_settings, filter_settings=False)
     self.permissions = self._load_sections(
         'permission', self.permission_settings)
     cluster_sections = self._get_sections('cluster')
     self.clusters = self._load_cluster_sections(cluster_sections)
     return self
Exemplo n.º 5
0
 def addEnvarsToProfile(self, node):
     """
         Append the text returned by exportEnvironmentVars() to
         /etc/profile on the given node (echoed in single quotes over ssh)
     """
     exports = self.exportEnvironmentVars()
     log.debug("sge.addEnvarsToProfile    envars: echo '%s' >> /etc/profile", exports)
     append_cmd = "".join(["echo '", exports, "' >> /etc/profile"])
     node.ssh.execute(append_cmd)
Exemplo n.º 6
0
	def _probe_peers(self, master, nodes):
		"""
		Run a gluster peer probe on master for every node.

		One `gluster peer probe` command is executed on master per node and
		its output is logged at debug level, followed by the output of
		`gluster peer status`.
		"""
		log.info("Probing %d nodes" % len(nodes))
		for node in nodes:
			# Build a fresh command for each node. Accumulating the probes
			# in one string and executing it inside the loop re-probed every
			# earlier node again on each iteration.
			cmd = "/usr/sbin/gluster peer probe %s;" % node.alias
			log.debug(master.ssh.execute(cmd))
		log.debug(master.ssh.execute("/usr/sbin/gluster peer status"))
Exemplo n.º 7
0
    def run(self, nodes, master, user, user_shell, volumes):
        """
            Open the NFS ports, fix the mountd port on the nodes and the
            head, then mount on every node
        """
        log.info("Running plugin automount")
        log.debug(
            "automount.NfsShares.run    automount.NfsShares.run(nodes, master, user, user_shell, volumes)"
        )

        #### NFS PORTS: "default" FIRST, THEN THE '@sc-' NAME OF THIS CLUSTER
        self.openNfsPorts("default")
        cluster_tag = '@sc-' + self.cluster
        self.openNfsPorts(cluster_tag)

        #### HEAD NODE INTERNAL IP
        self.getHeadIp()

        #### SAME FIXED mountd PORT ON EVERY NODE, THEN ON THE HEAD
        mountd_port = "32767"
        for target in nodes:
            self.setMountdOnNode(target, mountd_port)
        self.setMountdOnHead(mountd_port)
        self.restartServicesOnHead()

        #### MOUNT ON EVERY NODE
        for target in nodes:
            self.mount(target)

        log.info("Completed plugin automount")
Exemplo n.º 8
0
 def _eval_remove_node(self):
     """
     Decide from the current sge stats whether nodes should be removed
     and, if so, remove them.

     Nothing happens while jobs are queued, while the cluster has not
     stabilized, or when the host count is already at or below min_nodes.
     Otherwise at most (hosts - min_nodes) candidates are requested from
     _find_nodes_for_removal() and removed one at a time; a failed removal
     is logged and does not stop the remaining ones.
     """
     if len(self.stat.get_queued_jobs()) != 0:
         return
     if not self.has_cluster_stabilized():
         return
     host_count = len(self.stat.hosts)
     if host_count <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)" %
                  self.min_nodes)
         return
     removable = host_count - self.min_nodes
     log.info("Looking for nodes to remove...")
     candidates = self._find_nodes_for_removal(max_remove=removable)
     if not candidates:
         log.info("No nodes can be removed at this time")
     for candidate in candidates:
         # refresh the state first: only running nodes are removed
         if candidate.update() != "running":
             log.error("Node %s is already dead - not removing" %
                       candidate.alias)
             continue
         log.warn("Removing %s: %s (%s)" %
                  (candidate.alias, candidate.id, candidate.dns_name))
         try:
             self._cluster.remove_node(candidate)
             self.__last_cluster_mod_time = datetime.datetime.utcnow()
         except Exception:
             log.error("Failed to remove node %s" % candidate.alias)
             log.debug(traceback.format_exc())
Exemplo n.º 9
0
    def enableSchedulingInfo(self):
        """
            Enable job scheduling info output for 'qstat -j'

            Reads the current configuration with 'qconf -ssconf', switches
            schedd_job_info from false to true, writes the result to a
            temporary file and loads it back with 'qconf -Msconf'. The
            temporary file is removed afterwards, even on failure.
        """
        # function-scope import: tempfile is only needed by this method
        import tempfile

        log.info("Enabling job scheduling info")

        envars = self.exportEnvironmentVars()
        show_cmd = envars + self.rootpath + "/qconf -ssconf"
        log.debug(show_cmd)
        # The command is one shell string (envars followed by the qconf
        # call), hence shell=True. communicate() drains the pipe and waits
        # for the child; universal_newlines=True yields text, not bytes.
        proc = subprocess.Popen(show_cmd,
                                stdout=subprocess.PIPE,
                                shell=True,
                                universal_newlines=True)
        queue_template = proc.communicate()[0]
        log.debug("sge.CreateCell.enableSchedulingInfo    BEFORE queue_template: %s", queue_template)

        match = "schedd_job_info                   false"
        insert = "schedd_job_info                   true"
        # str.replace instead of string.replace, which is Python 2 only
        queue_template = queue_template.replace(match, insert)
        log.debug("sge.CreateCell.enableSchedulingInfo    AFTER queue_template: %s", queue_template)

        # mkstemp creates a uniquely named file atomically, unlike the
        # predictable /tmp/queue-<pid>.txt path used before
        handle, filename = tempfile.mkstemp(prefix="queue-", suffix=".txt")
        try:
            queue_file = os.fdopen(handle, 'w')
            try:
                # trailing newline matches what "print >>" used to append
                queue_file.write(queue_template + "\n")
            finally:
                queue_file.close()

            cmd = envars + self.rootpath + "/qconf -Msconf " + filename
            log.debug(cmd)
            os.system(cmd)
        finally:
            log.debug("removing %s", filename)
            os.remove(filename)
Exemplo n.º 10
0
 def get(self, remotepaths, localpath=''):
     """
     Copies one or more files from the remote host to the local host.

     remotepaths - remote path(s), normalized with _make_list(); entries
                   containing glob magic are expanded with self.glob()
     localpath - local destination; the current working directory is used
                 when it is empty

     Raises exception.BaseException if a remote path does not exist and
     exception.SCPException if the transfer itself fails.
     """
     remotepaths = self._make_list(remotepaths)
     localpath = localpath or os.getcwd()
     globs = []
     noglobs = []
     for rpath in remotepaths:
         if glob.has_magic(rpath):
             globs.append(rpath)
         else:
             noglobs.append(rpath)
     # literal paths first, followed by whatever each pattern expands to
     globresults = [self.glob(g) for g in globs]
     remotepaths = noglobs
     for globresult in globresults:
         remotepaths.extend(globresult)
     # verify every path before looking at any of them more closely
     for rpath in remotepaths:
         if not self.path_exists(rpath):
             raise exception.BaseException(
                 "Remote file or directory does not exist: %s" % rpath)
     # a single directory is enough to require a recursive copy
     recursive = any(self.isdir(rpath) for rpath in remotepaths)
     try:
         self.scp.get(remotepaths, local_path=localpath,
                      recursive=recursive)
     except Exception as e:
         log.debug("get failed: remotepaths=%s, localpath=%s",
                   str(remotepaths), localpath)
         raise exception.SCPException(str(e))
Exemplo n.º 11
0
 def execute(self, args):
     """
     Copy remote files or directories from a cluster node to a local path.

     args holds the cluster tag first, the local destination last and the
     remote path(s) in between; fewer than three entries is reported
     through the parser as an error.
     """
     if len(args) < 3:
         self.parser.error("please specify a cluster, remote file or " +
                           "directory, and a local destination path")
     ctag, rpaths, lpath = args[0], args[1:-1], args[-1]
     cl = self.cm.get_cluster(ctag, load_receipt=False)
     node_name = self.opts.node
     try:
         node = cl.get_node(node_name)
     except exception.InstanceDoesNotExist as ide:
         if node_name != "master":
             # an explicit node name was provided: nothing else to try
             raise
         # the master may be named clustername-master
         # (dns_prefix = True in config), so try that alias as well
         try:
             node = cl.get_node('%s-%s' % (ctag, node_name))
         except exception.InstanceDoesNotExist:
             # master is just not there: raise the original error
             log.debug("Neither master nor %s-%s exist." %
                       (ctag, node_name))
             raise ide
     if self.opts.user:
         node.ssh.switch_user(self.opts.user)
     for rpath in rpaths:
         # glob patterns are not checked here
         if glob.has_magic(rpath) or node.ssh.path_exists(rpath):
             continue
         raise exception.BaseException(
             "Remote file or directory does not exist: %s" % rpath)
     node.ssh.get(rpaths, lpath)
Exemplo n.º 12
0
Arquivo: sge.py Projeto: agua/aguadev
    def enableSchedulingInfo(self):
        """
            Enable job scheduling info output for 'qstat -j'

            Reads the current configuration with 'qconf -ssconf', switches
            schedd_job_info from false to true, writes the result to a
            temporary file and loads it back with 'qconf -Msconf'. The
            temporary file is removed afterwards, even on failure.
        """
        # function-scope import: tempfile is only needed by this method
        import tempfile

        log.info("Enabling job scheduling info")

        envars = self.exportEnvironmentVars()
        show_cmd = envars + self.rootpath + "/qconf -ssconf"
        log.debug(show_cmd)
        # The command is one shell string (envars followed by the qconf
        # call), hence shell=True. communicate() drains the pipe and waits
        # for the child; universal_newlines=True yields text, not bytes.
        proc = subprocess.Popen(show_cmd,
                                stdout=subprocess.PIPE,
                                shell=True,
                                universal_newlines=True)
        queue_template = proc.communicate()[0]
        log.debug(
            "sge.CreateCell.enableSchedulingInfo    BEFORE queue_template: %s",
            queue_template)

        match = "schedd_job_info                   false"
        insert = "schedd_job_info                   true"
        # str.replace instead of string.replace, which is Python 2 only
        queue_template = queue_template.replace(match, insert)
        log.debug(
            "sge.CreateCell.enableSchedulingInfo    AFTER queue_template: %s",
            queue_template)

        # mkstemp creates a uniquely named file atomically, unlike the
        # predictable /tmp/queue-<pid>.txt path used before
        handle, filename = tempfile.mkstemp(prefix="queue-", suffix=".txt")
        try:
            queue_file = os.fdopen(handle, 'w')
            try:
                # trailing newline matches what "print >>" used to append
                queue_file.write(queue_template + "\n")
            finally:
                queue_file.close()

            cmd = envars + self.rootpath + "/qconf -Msconf " + filename
            log.debug(cmd)
            os.system(cmd)
        finally:
            log.debug("removing %s", filename)
            os.remove(filename)
Exemplo n.º 13
0
 def scp(self):
     """Return the SCP client, creating a new one when there is none yet
     or when the current client's transport is no longer active."""
     client = self._scp
     if client and client.transport.is_active():
         return client
     log.debug("creating scp connection")
     self._scp = scp.SCPClient(self.transport,
                               progress=self._file_transfer_progress)
     return self._scp
Exemplo n.º 14
0
 def settingsCommand(self):
     """
     Return the shell command that runs create_settings.sh for this cell.

     The command has three parts: a 'cd' into <root>/<cell>/common, the
     text returned by exportEnvironmentVars(), and the call to
     <root>/util/create_settings.sh with that same directory as argument.
     """
     common_dir = self.root + "/" + self.cell + "/common"
     change_dir = 'cd ' + common_dir + '; '
     env_exports = self.exportEnvironmentVars()
     create_settings = self.root + '/util/create_settings.sh ' + common_dir
     command = change_dir + env_exports + create_settings
     log.debug("sge.CreateCell.createSettings    cmd: %s", command)
     return command
Exemplo n.º 15
0
 def scp(self):
     """Return the SCP client, building one over self.transport first if
     none exists or the existing client's transport is inactive."""
     existing = self._scp
     reusable = bool(existing) and existing.transport.is_active()
     if not reusable:
         log.debug("creating scp connection")
         self._scp = scp.SCPClient(self.transport,
                                   progress=self._file_transfer_progress)
     return self._scp
Exemplo n.º 16
0
 def connect(self, host=None, username=None, password=None,
             private_key=None, private_key_pass=None, port=None, timeout=30,
             compress=None):
     """
     Open and authenticate a paramiko transport to host.

     host, username, password and compress fall back to the values stored
     on this object (_host, _username, _password, _compress) when falsy;
     port falls back to _port only when it is None. When private_key is
     given it is loaded with load_private_key(), otherwise _pkey is used.
     timeout is applied as the transport's banner timeout.

     Raises exception.SSHConnectionError if the socket or transport cannot
     be set up, exception.SSHAuthException if authentication fails and
     exception.SSHError for any other paramiko.SSHException.
     """
     host = host or self._host
     username = username or self._username
     password = password or self._password
     compress = compress or self._compress
     port = port if port is not None else self._port
     pkey = self._pkey
     if private_key:
         pkey = self.load_private_key(private_key, private_key_pass)
     log.debug("connecting to host %s on port %d as user %s" % (host, port,
                                                                username))
     try:
         sock = self._get_socket(host, port)
         transport = paramiko.Transport(sock)
         transport.banner_timeout = timeout
     except socket.error:
         raise exception.SSHConnectionError(host, port)
     # Enable/disable compression
     transport.use_compression(compress)
     # NOTE(review): transport is neither stored nor returned in the code
     # visible here - confirm against the full method.
     # Authenticate the transport.
     try:
         transport.connect(username=username, pkey=pkey, password=password)
     except paramiko.AuthenticationException:
         raise exception.SSHAuthException(username, host)
     except paramiko.SSHException as e:
         # an exception raised without arguments has an empty args tuple
         msg = e.args[0] if e.args else str(e)
         raise exception.SSHError(msg)
Exemplo n.º 17
0
    def addParallelEnvironment(self, master):
        """
            Add 'threaded' parallel environment

            Writes the parallel environment definition (slots set to 99999)
            to /tmp/pe.txt on master, registers it with 'qconf -Ap' and
            sets it as the pe_list attribute of all.q with 'qconf -mattr'.
        """
        log.info("Adding 'threaded' parallel environment")

        sge_pe_template = """
        pe_name           threaded
        slots             %s
        user_lists        NONE
        xuser_lists       NONE
        start_proc_args   /bin/true
        stop_proc_args    /bin/true
        allocation_rule   $pe_slots
        control_slaves    TRUE
        job_is_first_task FALSE
        urgency_slots     min
        accounting_summary FALSE
        """

        log.debug("addParallelEnvironment    sge_pe_template: %s", sge_pe_template)

        #### PRINT TEMPLATE FILE
        pe_file = master.ssh.remote_file("/tmp/pe.txt")
        try:
            # write() instead of the Python 2 only "print >>" statement;
            # the trailing newline matches what print appended
            pe_file.write(sge_pe_template % 99999 + "\n")
        finally:
            # close the remote file even if the write fails
            pe_file.close()

        envars = self.exportEnvironmentVars()

        rootpath = self.getRootPath(master)
        log.debug("CreateCell.addParallelEnvironment    rootpath: %s", rootpath)

        #### REGISTER THE ENVIRONMENT, THEN SET IT ON all.q
        master.ssh.execute(envars + rootpath + "/qconf -Ap %s &> /tmp/pe.out" % pe_file.name)
        master.ssh.execute(envars + rootpath + '/qconf -mattr queue pe_list "threaded" all.q &> /tmp/pe2q.out')
Exemplo n.º 18
0
 def __init__(self, enable_hvmem="True", master_slots=0):
     """
     Store the enable_hvmem and master_slots options.

     enable_hvmem - a boolean or its textual form. The boolean False and
                    the string "false" in any letter case (surrounding
                    whitespace ignored) disable it; every other value
                    enables it.
     master_slots - stored unchanged on self.master_slots
     """
     if isinstance(enable_hvmem, bool):
         # a real boolean used to compare unequal to "False" and was
         # therefore always treated as enabled
         self.enable_hvmem = enable_hvmem
     else:
         self.enable_hvmem = str(enable_hvmem).strip().lower() != "false"
     self.master_slots = master_slots
     log.debug("enable_hvmem = %s , master_slots = %s" % (self.enable_hvmem, self.master_slots))
Exemplo n.º 19
0
 def execute(self, args):
     """
     Stop every cluster named in args.

     Without arguments an error is reported through the parser, listing
     the known cluster tags when there are any. For each cluster:

     1. The cluster is loaded. If that fails for a reason other than
        ClusterDoesNotExist and opts.force is set, it is loaded again
        without receipt and keys; otherwise the error is re-raised.
     2. A cluster that is not stoppable is rejected when it has no
        stoppable nodes, or when it has some and
        opts.terminate_unstoppable is not set.
     3. Unless opts.confirm is set the user is asked to confirm.
     4. The cluster is stopped.
     """
     if not args:
         cls = [
             c.cluster_tag for c in self.cm.get_clusters(load_plugins=False,
                                                         load_receipt=False)
         ]
         msg = "please specify a cluster"
         if cls:
             opts = ', '.join(cls)
             msg = " ".join([msg, '(options:', opts, ')'])
         self.parser.error(msg)
     for cluster_name in args:
         try:
             cl = self.cm.get_cluster(cluster_name)
         except exception.ClusterDoesNotExist:
             raise
         except Exception as e:
             log.debug("Failed to load cluster settings!", exc_info=True)
             log.error("Failed to load cluster settings!")
             if self.opts.force:
                 log.warn("Ignoring cluster settings due to --force option")
                 cl = self.cm.get_cluster(cluster_name,
                                          load_receipt=False,
                                          require_keys=False)
             else:
                 # no -f hint when the failure is an IncompatibleCluster
                 if not isinstance(e, exception.IncompatibleCluster):
                     log.error("Use -f to forcefully stop the cluster")
                 raise
         is_stoppable = cl.is_stoppable()
         if not is_stoppable:
             has_stoppable_nodes = cl.has_stoppable_nodes()
             if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                     "nodes. Your options are:\n\n"
                     "1. Use the --terminate-unstoppable option to "
                     "stop all 'stoppable' nodes and terminate all "
                     "'unstoppable' nodes\n\n"
                     "2. Use the 'terminate' command to destroy the "
                     "cluster.\n\nPass --help for more info." %
                     cluster_name)
             if not has_stoppable_nodes:
                 raise exception.BaseException(
                     "Cluster '%s' does not contain any 'stoppable' nodes "
                     "and can only be terminated. Please use the "
                     "'terminate' command instead to destroy the cluster."
                     "\n\nPass --help for more info" % cluster_name)
         if not self.opts.confirm:
             resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
             if resp not in ['y', 'Y', 'yes']:
                 # skip this cluster only; later ones are still processed
                 log.info("Aborting...")
                 continue
         cl.stop_cluster(self.opts.terminate_unstoppable,
                         force=self.opts.force)
         log.warn("All non-spot, EBS-backed nodes are now in a "
                  "'stopped' state")
         log.warn("You can restart this cluster by passing -x "
                  "to the 'start' command")
         log.warn("Use the 'terminate' command to *completely* "
                  "terminate this cluster")
Exemplo n.º 20
0
 def _find_node_for_removal(self):
     """
     Return the list of running nodes that may be removed right now.

     A node is selected when all of the following hold:
     1. It is not the master, unless allow_master_kill is set
     2. self.stat reports that it is not working
     3. Its uptime modulo 60 minutes is at least self.kill_after

     When polling_interval exceeds 300, kill_after is first recomputed as
     max(45, 60 - (2 * polling_interval / 60)).
     """
     # The threshold depends on polling_interval only, so it is computed
     # once up front instead of once per node inside the loop.
     if self.polling_interval > 300:
         self.kill_after = \
             max(45, 60 - (2 * self.polling_interval / 60))
     to_rem = []
     for node in self._cluster.running_nodes:
         if not self.allow_master_kill and \
                 node.id == self._cluster.master_node.id:
             log.debug("not removing master node")
             continue
         is_working = self.stat.is_node_working(node)
         mins_up = self._minutes_uptime(node) % 60
         if is_working:
             continue
         log.info("Idle Node %s (%s) has been up for %d minutes "
                  "past the hour."
                  % (node.id, node.alias, mins_up))
         if mins_up >= self.kill_after:
             to_rem.append(node)
     return to_rem
Exemplo n.º 21
0
 def _get_stats(self):
     """
     Run qhost, qstat and qacct on the master and parse their output.

     Returns a new SGEStats instance that has parsed the three outputs.
     qhost and qstat are executed with raise_on_failure=True, while qacct
     is executed with ignore_exit_status=True.
     """
     master = self._cluster.master_node
     now = self.get_remote_time()
     qatime = self.get_qatime(now)
     qacct_cmd = 'qacct -j -b ' + qatime
     # The command must contain a literal backslash before '*'
     # (presumably to keep the remote shell from expanding it). It is
     # spelled '\\' because '\*' is not a valid string escape and
     # triggers a warning on recent Python versions.
     qstat_cmd = 'qstat -u \\* -xml'
     qhostxml = '\n'.join(
         master.ssh.execute('qhost -xml',
                            log_output=True,
                            source_profile=True,
                            raise_on_failure=True))
     qstatxml = '\n'.join(
         master.ssh.execute(qstat_cmd,
                            log_output=True,
                            source_profile=True,
                            raise_on_failure=True))
     qacct = '\n'.join(
         master.ssh.execute(qacct_cmd,
                            log_output=True,
                            ignore_exit_status=True,
                            source_profile=True))
     stats = SGEStats()
     stats.parse_qhost(qhostxml)
     stats.parse_qstat(qstatxml, queues=["all.q", ""])
     stats.parse_qacct(qacct, now)
     log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" %
               (len(qhostxml), len(qstatxml), len(qacct)))
     return stats
Exemplo n.º 22
0
 def execute(self, args):
     """
     Fetch remote files or directories from a node of a cluster.

     The first entry of args is the cluster tag, the last one the local
     destination, and everything in between is a remote path. Fewer than
     three entries is reported through the parser as an error.
     """
     if len(args) < 3:
         self.parser.error("please specify a cluster, remote file or " +
                           "directory, and a local destination path")
     ctag = args[0]
     lpath = args[-1]
     rpaths = args[1:-1]
     cl = self.cm.get_cluster(ctag, load_receipt=False)
     wanted = self.opts.node
     try:
         node = cl.get_node(wanted)
     except exception.InstanceDoesNotExist as ide:
         if wanted != "master":
             # node name was provided
             raise
         # may have happened because master node is clustername-master,
         # i.e. dns_prefix = True in config, so check that name too
         prefixed_name = '%s-%s' % (ctag, wanted)
         try:
             node = cl.get_node(prefixed_name)
         except exception.InstanceDoesNotExist:
             # master is just not there, raise original error
             log.debug("Neither master nor %s-%s exist." % (ctag, wanted))
             raise ide
     if self.opts.user:
         node.ssh.switch_user(self.opts.user)
     for rpath in rpaths:
         # paths containing glob magic are not checked here
         if glob.has_magic(rpath):
             continue
         if node.ssh.path_exists(rpath):
             continue
         raise exception.BaseException(
             "Remote file or directory does not exist: %s" % rpath)
     node.ssh.get(rpaths, lpath)
Exemplo n.º 23
0
    def get_stats(self):
        """
        SSH to the SGE master and collect load & queue stats.

        A fresh SGEStats instance is stored on self.stat and the raw output
        of qhost, qstat and qacct is fetched from the master. Returns -1 if
        fetching fails.
        """
        log.debug("starting get_stats")
        master = self._cluster.master_node
        self.stat = SGEStats()

        qhostXml = ""
        qstatXml = ""
        qacct = ""
        try:
            now = self.get_remote_time()
            qatime = self.get_qatime(now)
            qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
            qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
            qhostXml = '\n'.join(master.ssh.execute(
                'source /etc/profile && qhost -xml', log_output=False))
            qstatXml = '\n'.join(master.ssh.execute(qstat_cmd,
                                                    log_output=False))
            # the exit status of qacct is ignored
            qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=False,
                                                 ignore_exit_status=True))
        except Exception as e:
            log.error("Error occured getting SGE stats via ssh. "
                      "Cluster terminated?")
            log.error(e)
            return -1
        # NOTE(review): the collected outputs are not consumed in the code
        # visible here - confirm against the full method.
Exemplo n.º 24
0
 def connect(self,
             host=None,
             username=None,
             password=None,
             private_key=None,
             private_key_pass=None,
             port=22,
             timeout=30):
     """
     Open and authenticate an ssh transport to host.

     host, username and password fall back to the values stored on this
     object (_host, _username, _password) when falsy. When private_key is
     given it is loaded with load_private_key(), otherwise _pkey is used.
     timeout is applied as the transport's banner timeout.

     Raises exception.SSHConnectionError if the socket or transport cannot
     be set up, exception.SSHAuthException if authentication fails and
     exception.SSHError for any other ssh.SSHException.
     """
     host = host or self._host
     username = username or self._username
     password = password or self._password
     pkey = self._pkey
     if private_key:
         pkey = self.load_private_key(private_key, private_key_pass)
     log.debug("connecting to host %s on port %d as user %s" %
               (host, port, username))
     try:
         sock = self._get_socket(host, port)
         transport = ssh.Transport(sock)
         transport.banner_timeout = timeout
     except socket.error:
         raise exception.SSHConnectionError(host, port)
     # NOTE(review): transport is neither stored nor returned in the code
     # visible here - confirm against the full method.
     # Authenticate the transport.
     try:
         transport.connect(username=username, pkey=pkey, password=password)
     except ssh.AuthenticationException:
         raise exception.SSHAuthException(username, host)
     except ssh.SSHException as e:
         # an exception raised without arguments has an empty args tuple
         msg = e.args[0] if e.args else str(e)
         raise exception.SSHError(msg)
Exemplo n.º 25
0
    def editStartupScript(self, file, master):
        """
            Add entry to the startup script on master (default
            /etc/rc.local) to run resetMaster.pl on boot

            Every "exit 0" and any existing copy of the command are removed
            from the script; the command and a final "exit 0" are then
            appended and the script is written back.
        """
        log.info("Adding entry to /etc/rc.local to run masterRestart on boot")
        log.debug("startup.StartUp.editStartupScript    self.installdir: %s ", self.installdir)
        if file is None or file == "":
            file = "/etc/rc.local"
        log.debug("startup.StartUp.editStartupScript    file: %s ", file)

        #### SET RUN resetMaster.pl COMMAND
        command = self.resetdir + "/resetMaster.pl " \
            + " --cell " + self.cell \
            + " --headnodeid " + self.headnodeid \
            + " --cgiscript " + "/cgi-bin/agua/reset.cgi"
        log.debug("startup.StartUp.editStartupScript    command: %s ", command)

        #### READ CURRENT CONTENTS (the handle used to be left open)
        infilehandle = master.ssh.remote_file(file, 'r')
        try:
            contents = infilehandle.read()
        finally:
            infilehandle.close()
        log.debug("startup.StartUp.editStartupScript    contents: %s ", contents)

        #### DROP OLD ENTRIES, THEN APPEND COMMAND FOLLOWED BY "exit 0"
        # str.replace instead of string.replace, which is Python 2 only
        contents = contents.replace("exit 0", "")
        contents = contents.replace(command, "")
        contents += command + "\n"
        contents += "\nexit 0\n"
        log.debug("startup.StartUp.editStartupScript    printing to %s contents: %s ", file, contents)

        #### PRINT COMMAND TO FILE
        outfilehandle = master.ssh.remote_file(file, 'w')
        try:
            outfilehandle.write(contents)
        finally:
            outfilehandle.close()
Exemplo n.º 26
0
 def alias(self):
     """
     Return this node's alias, resolving and caching it on first use.

     The 'alias' tag is consulted first. Without it, the user data is
     split on '|' and the entry at ami_launch_index is taken and stored as
     the 'alias' tag; an exception is raised if that yields nothing. A
     missing 'Name' tag is set to the alias as well.
     """
     if self._alias:
         return self._alias
     resolved = self.tags.get('alias')
     if not resolved:
         candidates = self._get_user_data(tries=5).split('|')
         position = self.ami_launch_index
         try:
             resolved = candidates[position]
         except IndexError:
             log.debug("invalid user_data: %s (index: %d)" %
                       (candidates, position))
             resolved = None
         if not resolved:
             raise exception.BaseException("instance %s has no alias" %
                                           self.id)
         self.add_tag('alias', resolved)
     if not self.tags.get('Name'):
         self.add_tag('Name', resolved)
     self._alias = resolved
     return self._alias
Exemplo n.º 27
0
 def __init__(self, my_arg, my_other_arg, my_other_other_arg):
     """Store the three arguments on the instance and log them at debug
     level."""
     self.my_arg = my_arg
     self.my_other_arg = my_other_arg
     self.my_other_other_arg = my_other_other_arg
     template = ("setupclass3: my_arg = %s, my_other_arg = %s"
                 " my_other_other_arg = %s")
     log.debug(template % (my_arg, my_other_arg, my_other_other_arg))
Exemplo n.º 28
0
    def export_fs_to_nodes(self, nodes, export_paths):
        """
        Export every path in export_paths to every node in nodes via NFS

        nodes - list of nodes that each path is exported to
        export_paths - list of paths on this remote host to export

        Entries for these nodes and paths are first dropped with
        stop_exporting_fs_to_nodes(). Each "<path> <alias>(<options>)" line
        not already present in /etc/exports is then appended to it, and
        'exportfs -fra' is run.

        Example:
        # export /home and /opt/sge6 to each node in nodes
        $ node.start_nfs_server()
        $ node.export_fs_to_nodes(nodes=[node1,node2],
                                  export_paths=['/home', '/opt/sge6'])
        """
        log.debug("Cleaning up potentially stale NFS entries")
        self.stop_exporting_fs_to_nodes(nodes, paths=export_paths)
        log.info("Configuring NFS exports path(s):\n%s" %
                 ' '.join(export_paths))
        options = "(async,no_root_squash,no_subtree_check,rw)"
        # snapshot of the current file, used to skip lines already present
        reader = self.ssh.remote_file('/etc/exports', 'r')
        existing = reader.read()
        reader.close()
        writer = self.ssh.remote_file('/etc/exports', 'a')
        for node in nodes:
            for path in export_paths:
                entry = path + ' ' + node.alias + options + '\n'
                if entry in existing:
                    continue
                writer.write(entry)
        writer.close()
        self.ssh.execute('exportfs -fra')
Exemplo n.º 29
0
Arquivo: sge.py Projeto: agua/aguadev
    def on_add_node(self, node, nodes, master, user, user_shell, volumes):
        """
            Run the sge.CreateCell steps for a node that is being added:
            getHeadIp, addEnvarsToProfile, copyCell, restartSge and
            addToAllhosts, in that order
        """
        log.info("Doing 'on_add_node' for plugin: sge.CreateCell")
        log.info("Adding %s", node.alias)
        log.debug(
            "sge.CreateCell.on_add_node    CreateCell.on_add_node(self, node, nodes, master, user, user_shell, volumes)"
        )
        log.debug("sge.CreateCell.on_add_node    node.private_dns_name: %s" %
                  node.private_dns_name)

        #### SET HEAD NODE INTERNAL IP
        self.getHeadIp()

        #### STEPS THAT ONLY NEED THE NEW NODE, IN ORDER:
        #### ENVIRONMENT VARIABLES, CELL COPY, SGE RESTART
        node_steps = (self.addEnvarsToProfile, self.copyCell, self.restartSge)
        for step in node_steps:
            step(node)

        #### ADD NODE TO @allhosts GROUP
        self.addToAllhosts(node, master)

        log.info("Completed 'on_add_node' for plugin: sge.CreateCell")
Exemplo n.º 30
0
 def _eval_remove_node(self):
     """
     Use the sge stats to decide whether nodes can be removed from the
     cluster, and remove the ones that can.

     Returns early when jobs are queued, when the cluster has not
     stabilized, or when the number of hosts is at or below min_nodes.
     Nodes returned by _find_nodes_for_removal() are removed one by one;
     a failed removal is logged and the loop moves on.
     """
     queued_jobs = self.stat.get_queued_jobs()
     if len(queued_jobs) != 0:
         return
     if not self.has_cluster_stabilized():
         return
     num_hosts = len(self.stat.hosts)
     if num_hosts <= self.min_nodes:
         log.info("Not removing nodes: already at or below minimum (%d)"
                  % self.min_nodes)
         return
     spare = num_hosts - self.min_nodes
     log.info("Looking for nodes to remove...")
     doomed = self._find_nodes_for_removal(max_remove=spare)
     if not doomed:
         log.info("No nodes can be removed at this time")
     for victim in doomed:
         state = victim.update()
         if state != "running":
             log.error("Node %s is already dead - not removing" %
                       victim.alias)
             continue
         log.warn("Removing %s: %s (%s)" %
                  (victim.alias, victim.id, victim.dns_name))
         try:
             self._cluster.remove_node(victim)
             self.__last_cluster_mod_time = datetime.datetime.utcnow()
         except Exception:
             log.error("Failed to remove node %s" % victim.alias)
             log.debug(traceback.format_exc())
Exemplo n.º 31
0
 def alias(self):
     """
     Look up this node's alias, caching the result in self._alias.

     The 'alias' tag wins when present. Otherwise the user data is split
     on '|' and indexed with ami_launch_index; the result is saved as the
     'alias' tag, or an exception is raised when there is none. The
     'Name' tag is also set to the alias when it is missing.
     """
     if not self._alias:
         found = self.tags.get('alias')
         if not found:
             raw_user_data = self._get_user_data(tries=5)
             alias_list = raw_user_data.split('|')
             launch_index = self.ami_launch_index
             try:
                 found = alias_list[launch_index]
             except IndexError:
                 log.debug(
                     "invalid user_data: %s (index: %d)" %
                     (alias_list, launch_index))
                 found = None
             if not found:
                 raise exception.BaseException(
                     "instance %s has no alias" % self.id)
             self.add_tag('alias', found)
         has_name = self.tags.get('Name')
         if not has_name:
             self.add_tag('Name', found)
         self._alias = found
     return self._alias
Exemplo n.º 32
0
 def sftp(self):
     """
     Return the SFTP client, creating it on first use or when the
     existing client's socket has been closed.
     """
     if self._sftp and not self._sftp.sock.closed:
         return self._sftp
     log.debug("creating sftp connection")
     client = paramiko.SFTPClient.from_transport(self.transport)
     self._sftp = client
     self._sftp.get_channel().settimeout(self._timeout)
     return self._sftp
Exemplo n.º 33
0
 def __init__(self, my_arg, my_other_arg, my_other_other_arg):
     """Store the three arguments on the instance and log them (debug)."""
     self.my_arg = my_arg
     self.my_other_arg = my_other_arg
     self.my_other_other_arg = my_other_other_arg
     template = ("setupclass3: my_arg = %s, my_other_arg = %s"
                 " my_other_other_arg = %s")
     log.debug(template % (my_arg, my_other_arg, my_other_other_arg))
Exemplo n.º 34
0
    def export_fs_to_nodes(self, nodes, export_paths):
        """
        Export each path in export_paths to each node in nodes via NFS

        nodes - list of nodes to export each path to
        export_paths - list of paths on this remote host to export to each node

        self.stop_exporting_fs_to_nodes is called first for the same nodes
        and paths. Then one line per (path, node) pair is appended to
        /etc/exports unless an identical line is already there, and
        'exportfs -fra' is run.

        Example:
        # export /home and /opt/sge6 to each node in nodes
        $ node.start_nfs_server()
        $ node.export_fs_to_nodes(nodes=[node1,node2],
                                  export_paths=['/home', '/opt/sge6'])
        """
        log.debug("Cleaning up potentially stale NFS entries")
        self.stop_exporting_fs_to_nodes(nodes, paths=export_paths)
        log.info("Configuring NFS exports path(s):\n%s" %
                 ' '.join(export_paths))
        nfs_export_settings = "(async,no_root_squash,no_subtree_check,rw)"
        etc_exports = self.ssh.remote_file('/etc/exports', 'r')
        try:
            contents = etc_exports.read()
        finally:
            etc_exports.close()
        # Compare whole lines: a substring test against the file would treat
        # '/home node(...)' as present when only '/data/home node(...)' is.
        existing = set(contents.splitlines())
        # An unterminated last line would otherwise be glued to the first
        # entry appended below.
        needs_newline = bool(contents) and not contents.endswith('\n')
        etc_exports = self.ssh.remote_file('/etc/exports', 'a')
        try:
            for node in nodes:
                for path in export_paths:
                    export_line = ' '.join(
                        [path, node.alias + nfs_export_settings])
                    if export_line in existing:
                        continue
                    if needs_newline:
                        etc_exports.write('\n')
                        needs_newline = False
                    etc_exports.write(export_line + '\n')
                    existing.add(export_line)
        finally:
            # always release the remote handle, even if a write fails
            etc_exports.close()
        self.ssh.execute('exportfs -fra')
Exemplo n.º 35
0
    def get_stats(self):
        """
        SSH to the SGE master and collect load & queue stats.

        A fresh SGEStats instance is stored on self.stat, then qhost, qstat
        and qacct are run on the master (each after sourcing /etc/profile)
        and their output lines are joined with newlines. qacct is run with
        ignore_exit_status=True.

        Returns -1 when any of the remote calls raises; the error is logged.

        NOTE(review): the earlier docstring said the output is fed to
        SGEStats and that host/job arrays are returned. That parsing step is
        not part of the code visible here - confirm against the full
        implementation.
        """
        log.debug("starting get_stats")
        master = self._cluster.master_node
        self.stat = SGEStats()

        qhostXml = ""
        qstatXml = ""
        qacct = ""
        try:
            now = self.get_remote_time()
            qatime = self.get_qatime(now)
            qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
            qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
            qhostXml = '\n'.join(master.ssh.execute(
                'source /etc/profile && qhost -xml', log_output=False))
            qstatXml = '\n'.join(
                master.ssh.execute(qstat_cmd, log_output=False))
            qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=False,
                                                 ignore_exit_status=True))
        except Exception as e:
            # 'as' form is valid on Python 2.6+ and Python 3
            log.error("Error occured getting SGE stats via ssh. "
                      "Cluster terminated?")
            log.error(e)
            return -1
Exemplo n.º 36
0
 def get(self, remotepaths, localpath=''):
     """
     Copies one or more files from the remote host to the local host.

     remotepaths - path(s) to fetch; normalized with self._make_list, and
                   entries containing glob magic are expanded via self.glob
     localpath - local destination, defaults to the current directory

     The copy is recursive as soon as one resolved path is a directory.

     Raises exception.BaseException if a resolved remote path does not
     exist and exception.SCPException if the transfer itself fails.
     """
     remotepaths = self._make_list(remotepaths)
     localpath = localpath or os.getcwd()
     patterns = []
     resolved = []
     for rpath in remotepaths:
         if glob.has_magic(rpath):
             patterns.append(rpath)
         else:
             resolved.append(rpath)
     # literal paths first, then the expansion of each pattern in order
     for matches in [self.glob(pattern) for pattern in patterns]:
         resolved.extend(matches)
     remotepaths = resolved
     for rpath in remotepaths:
         if not self.path_exists(rpath):
             raise exception.BaseException(
                 "Remote file or directory does not exist: %s" % rpath)
     recursive = any(self.isdir(rpath) for rpath in remotepaths)
     try:
         self.scp.get(remotepaths, local_path=localpath,
                      recursive=recursive)
     except Exception as e:
         # 'as' form is valid on Python 2.6+ and Python 3
         log.debug("get failed: remotepaths=%s, localpath=%s",
                   str(remotepaths), localpath)
         raise exception.SCPException(str(e))
Exemplo n.º 37
0
 def alias(self):
     """
     Return the node's alias, resolving and caching it on first use.

     The 'alias' tag is preferred. Without one, the aliases file stored
     in the user data is read, its first two lines are skipped and the
     entry at the instance's launch index is used and saved as the
     'alias' tag. The 'Name' tag is set to the alias when it is missing.

     Raises exception.BaseException when no alias can be determined.
     """
     if self._alias:
         return self._alias
     alias = self.tags.get('alias')
     if not alias:
         aliases_text = self.user_data.get(static.UD_ALIASES_FNAME)
         candidates = aliases_text.splitlines()[2:]
         launch_index = self.ami_launch_index
         try:
             alias = candidates[launch_index]
         except IndexError:
             log.debug("invalid aliases file in user_data:\n%s" %
                       aliases_text)
             alias = None
         if not alias:
             raise exception.BaseException(
                 "instance %s has no alias" % self.id)
         self.add_tag('alias', alias)
     name_tag = self.tags.get('Name')
     if not name_tag:
         self.add_tag('Name', alias)
     self._alias = alias
     return self._alias
Exemplo n.º 38
0
 def sftp(self):
     """
     Return the cached SFTP client; (re)create it from self.transport
     when there is none yet or its socket is closed.
     """
     current = self._sftp
     if current and not current.sock.closed:
         return current
     log.debug("creating sftp connection")
     self._sftp = paramiko.SFTPClient.from_transport(self.transport)
     channel = self._sftp.get_channel()
     channel.settimeout(self._timeout)
     return self._sftp
Exemplo n.º 39
0
    def mount_nfs_shares(self, server_node, remote_paths):
        """
        Mount each path in remote_paths from the remote server_node

        server_node - remote server node that is sharing the remote_paths
        remote_paths - list of remote paths to mount from server_node

        Paths whose '<server alias>:<path>' device already appears in this
        node's mount map are skipped. For the remaining paths, existing
        /etc/fstab lines containing the space-delimited path are removed, a
        fresh NFS entry is appended, the mount point is created when it
        does not exist and the share is mounted.
        """
        import re

        self.ssh.execute('/etc/init.d/portmap start')
        # TODO: move this fix for xterm somewhere else
        self.ssh.execute('mount -t devpts none /dev/pts',
                         ignore_exit_status=True)
        mount_map = self.get_mount_map()
        pending = []
        for path in remote_paths:
            network_device = "%s:%s" % (server_node.alias, path)
            if network_device in mount_map:
                mount_path, _typ, _options = mount_map.get(network_device)
                log.debug('nfs share %s already mounted to %s on '
                          'node %s, skipping...' %
                          (network_device, mount_path, self.alias))
            else:
                pending.append(path)
        if not pending:
            # Nothing left to mount; also avoids handing an empty pattern
            # to remove_lines_from_file.
            return
        # Pad each path with spaces so only a whole fstab field matches, and
        # escape it so regex metacharacters in a path are taken literally.
        stale_regex = '|'.join(re.escape(' %s ' % path) for path in pending)
        self.ssh.remove_lines_from_file('/etc/fstab', stale_regex)
        fstab = self.ssh.remote_file('/etc/fstab', 'a')
        try:
            for path in pending:
                fstab.write('%s:%s %s nfs vers=3,user,rw,exec,noauto 0 0\n' %
                            (server_node.alias, path, path))
        finally:
            # always release the remote handle, even if a write fails
            fstab.close()
        for path in pending:
            if not self.ssh.path_exists(path):
                self.ssh.makedirs(path)
            self.ssh.execute('mount %s' % path)
Exemplo n.º 40
0
 def connect(self, host=None, username=None, password=None,
             private_key=None, private_key_pass=None, port=None, timeout=30,
             compress=None):
     """
     Open a paramiko transport to the host and authenticate it.

     host, username, password and compress fall back to the values stored
     on this instance when falsy; port falls back only when it is None.
     private_key, when given, is loaded with private_key_pass and used
     instead of the stored key. timeout is applied as the transport's
     banner timeout.

     Raises exception.SSHConnectionError when the socket/transport cannot
     be set up, exception.SSHAuthException on authentication failure and
     exception.SSHError for other paramiko SSH errors. The transport is
     closed before an authentication/SSH error is raised.
     """
     host = host or self._host
     username = username or self._username
     password = password or self._password
     compress = compress or self._compress
     port = port if port is not None else self._port
     pkey = self._pkey
     if private_key:
         pkey = self.load_private_key(private_key, private_key_pass)
     log.debug("connecting to host %s on port %d as user %s" % (host, port,
                                                                username))
     try:
         sock = self._get_socket(host, port)
         transport = paramiko.Transport(sock)
         transport.banner_timeout = timeout
     except socket.error:
         raise exception.SSHConnectionError(host, port)
     # Enable/disable compression
     transport.use_compression(compress)
     # Authenticate the transport.
     try:
         transport.connect(username=username, pkey=pkey, password=password)
     except paramiko.AuthenticationException:
         # do not leave a half-open transport behind
         transport.close()
         raise exception.SSHAuthException(username, host)
     except paramiko.SSHException as e:
         transport.close()
         # an exception created without arguments has an empty args tuple
         msg = e.args[0] if e.args else str(e)
         raise exception.SSHError(msg)
Exemplo n.º 41
0
    def run(self, nodes, master, user, user_shell, volumes):
        """
            Mount NFS shares on master and all nodes

            Steps, in order: open the NFS ports for the "default" group
            and for this cluster's '@sc-' group, look up the head node IP,
            set the mountd port on every node and on the head, restart the
            services on the head and finally mount on every node.
        """
        log.info("Running plugin automount")
        log.debug("automount.NfsShares.run    automount.NfsShares.run(nodes, master, user, user_shell, volumes)")

        # open NFS-related ports for this cluster
        self.openNfsPorts("default")
        cluster_group = '@sc-' + self.cluster
        self.openNfsPorts(cluster_group)

        # set head node internal IP
        self.getHeadIp()

        # use one mountd port on the head and on master/exec nodes
        mountd_port = "32767"
        for target in nodes:
            self.setMountdOnNode(target, mountd_port)
        self.setMountdOnHead(mountd_port)
        self.restartServicesOnHead()

        # mount on all nodes
        for target in nodes:
            self.mount(target)

        log.info("Completed plugin automount")
Exemplo n.º 42
0
 def setMountdOnNode(self, node, mountdport):
     """
         Set the mountd port on one node by running the command built by
         self.mountdCommand(mountdport) over the node's ssh connection
     """
     log.info("Setting mountd port on %s", node.alias)
     command = self.mountdCommand(mountdport)
     log.debug("Doing node.ssh.execute: " + command)
     node.ssh.execute(command)
Exemplo n.º 43
0
 def _load_rsa_key(self, private_key, private_key_pass=None):
     """
     Load an RSA key from private_key ('~' expanded) using get_rsa_key.

     Returns the key, or None after logging an error when
     paramiko.SSHException or exception.SSHError is raised.
     """
     key_path = os.path.expanduser(private_key)
     try:
         key = get_rsa_key(key_location=key_path,
                           passphrase=private_key_pass)
     except (paramiko.SSHException, exception.SSHError):
         log.error("invalid rsa key or passphrase specified")
         return None
     log.debug("Using private key %s (RSA)" % private_key)
     return key
Exemplo n.º 44
0
 def conn(self):
     """
     Return the cached connection, creating it on first access with
     self.connection_authenticator and the stored AWS credentials.
     """
     if self._conn is not None:
         return self._conn
     message = 'creating self._conn w/ connection_authenticator '
     message += 'kwargs = %s' % self._kwargs
     log.debug(message)
     self._conn = self.connection_authenticator(
         self.aws_access_key_id, self.aws_secret_access_key,
         **self._kwargs)
     return self._conn
Exemplo n.º 45
0
 def _stage_attrs(self, fileName, attrsDict):
     """
     Write the formatted attributes to a file and return its path.

     The file is named fileName inside the directory returned by
     self._create_tmp_dir(), it is opened for writing through
     self.mssh.remote_file, and its content is the output of
     self._format_attrs(attrsDict).
     """
     # local names avoid shadowing the builtins 'dir' and 'file'
     tmp_dir = self._create_tmp_dir()
     path = "{dir}/{name}".format(dir=tmp_dir, name=fileName)
     log.debug("Checking for file %s", path)
     f = self.mssh.remote_file(path, mode="w")
     try:
         f.writelines(self._format_attrs(attrsDict))
     finally:
         # release the handle even when formatting or writing fails
         f.close()
     return path
Exemplo n.º 46
0
 def setMountdOnNode(self, node, mountdport):
     """
         Run the mountd command for the given port on a single node so
         that head, master and exec nodes use the same mountd port
     """
     log.info("Setting mountd port on %s", node.alias)
     mountd_cmd = self.mountdCommand(mountdport)
     log.debug("Doing node.ssh.execute: " + mountd_cmd)
     node.ssh.execute(mountd_cmd)
Exemplo n.º 47
0
 def conn(self):
     """
     Return the cached connection; on first access build it with
     self.connection_authenticator, the stored AWS keys and self._kwargs.
     """
     if self._conn is not None:
         return self._conn
     message = 'creating self._conn w/ connection_authenticator kwargs'
     message += ' = %s' % self._kwargs
     log.debug(message)
     self._conn = self.connection_authenticator(
         self.aws_access_key, self.aws_secret_access_key, **self._kwargs)
     return self._conn
Exemplo n.º 48
0
 def run(self, nodes, master, user, user_shell, volumes):
     """
     Run 'X :0 &' on every node, prefixed with 'sudo ' unless user is
     'root', logging the command before and 'OK' after each node.
     """
     prefix = '' if user == 'root' else 'sudo '
     command = "%sX :0 &" % prefix
     for node in nodes:
         log.debug('run on %s: %s' % (node.alias, command))
         node.ssh.execute(command)
         log.debug('run on %s: OK' % node.alias)
Exemplo n.º 49
0
 def _setup_etc_hosts(self, nodes=None):
     """
     Configure /etc/hosts on all StarCluster nodes.

     One add_to_etc_hosts job per node is submitted to self.pool, each
     receiving the full node list; the call waits for all of them.
     nodes defaults to self._nodes.
     """
     log.info("Configuring /etc/hosts on each node")
     if not nodes:
         nodes = self._nodes
     log.debug("Launching jobs " + str(datetime.datetime.utcnow()))
     job_args = (nodes, )
     for member in nodes:
         self.pool.simple_job(member.add_to_etc_hosts, job_args,
                              jobid=member.alias)
     self.pool.wait(numtasks=len(nodes))
Exemplo n.º 50
0
 def createSettings(self, node):
     """
         Generate settings.sh file containing SGE_CELL, SGE_ROOT and port info

         The command comes from self.settingsCommand() and is run over
         the given node's ssh connection.
     """
     log.info("Generating settings.sh file")
     log.debug("CreateCell.createSettings    CreateCell.createSettings(master)")
     settings_cmd = self.settingsCommand()
     log.debug("CreateCell.createSettings    cmd: %s", settings_cmd)
     node.ssh.execute(settings_cmd)
Exemplo n.º 51
0
 def _addToFstab(self, node, sourcedir, sourceip, mountpoint, interval):
     """
         Append an NFS entry for mountpoint to /etc/fstab on the node

         The entry uses self.head_ip as the server address; the sourceip
         and interval arguments are not used by this method.
     """
     log.info("Adding /etc/fstab entry (%s on %s)", mountpoint, node.alias)
     entry = "".join([self.head_ip, ":", sourcedir, "  ", mountpoint,
                      "  nfs  nfsvers=3,defaults 0 0"])
     command = "".join(["echo '", entry, "' >> /etc/fstab ;"])
     log.debug(command)
     node.ssh.execute(command)
Exemplo n.º 52
0
 def _load_rsa_key(self, private_key, private_key_pass=None):
     """
     Load an RSA key from the private_key file ('~' expanded) with
     paramiko.

     Returns the key, or None after logging an error when paramiko
     raises SSHException.
     """
     key_path = os.path.expanduser(private_key)
     try:
         key = paramiko.RSAKey.from_private_key_file(key_path,
                                                     private_key_pass)
     except paramiko.SSHException:
         log.error('invalid rsa key or password specified')
         return None
     log.debug("Using private key %s (rsa)" % private_key)
     return key
Exemplo n.º 53
0
 def _addToFstab(self, node, sourcedir, sourceip, mountpoint, interval):
     """
         Add an NFS line for mountpoint to /etc/fstab on a master/exec node

         self.head_ip is used as the server address; sourceip and interval
         are accepted but not used by this method.
     """
     log.info("Adding /etc/fstab entry (%s on %s)", mountpoint, node.alias)
     fstab_line = "".join([self.head_ip, ":", sourcedir, "  ", mountpoint,
                           "  nfs  nfsvers=3,defaults 0 0"])
     append_cmd = "".join(["echo '", fstab_line, "' >> /etc/fstab ;"])
     log.debug(append_cmd)
     node.ssh.execute(append_cmd)
Exemplo n.º 54
0
 def _load_rsa_key(self, private_key, private_key_pass=None):
     """
     Load an RSA key via get_rsa_key from private_key ('~' expanded).

     Returns the key on success. When paramiko.SSHException or
     exception.SSHError is raised, an error is logged and None is
     returned.
     """
     expanded = os.path.expanduser(private_key)
     try:
         loaded = get_rsa_key(key_location=expanded,
                              passphrase=private_key_pass)
     except (paramiko.SSHException, exception.SSHError):
         log.error('invalid rsa key or passphrase specified')
         return None
     log.debug("Using private key %s (RSA)" % private_key)
     return loaded
Exemplo n.º 55
0
 def switch_user(self, user):
     """
     Reconnect, if necessary, to host as user

     A reconnect happens when the connection is not active, or when a
     user is given and differs from the currently connected user.
     """
     # single expression keeps the short-circuit: the current user is only
     # queried on an active connection and when a user was given
     must_reconnect = not self.is_active() or (
         user and self.get_current_user() != user)
     if must_reconnect:
         self.connect(username=user)
         return
     connected_as = user or self._username
     log.debug("already connected as user %s" % connected_as)
Exemplo n.º 56
0
 def _load_rsa_key(self, private_key, private_key_pass=None):
     """
     Read an RSA private key file ('~' expanded) with paramiko.

     Returns the key on success; logs an error and returns None when
     paramiko raises SSHException.
     """
     expanded = os.path.expanduser(private_key)
     try:
         loaded = paramiko.RSAKey.from_private_key_file(
             expanded, private_key_pass)
     except paramiko.SSHException:
         log.error('invalid rsa key or password specified')
         return None
     log.debug("Using private key %s (rsa)" % private_key)
     return loaded
Exemplo n.º 57
0
 def switch_user(self, user):
     """
     Reconnect, if necessary, to host as user

     Connects again when the connection is inactive or when user is set
     and is not the currently connected user; otherwise only logs which
     user is already connected.
     """
     # one boolean expression so get_current_user() is still only called
     # on an active connection and with a user given
     reconnect = not self.is_active() or (
         user and self.get_current_user() != user)
     if not reconnect:
         effective_user = user or self._username
         log.debug("already connected as user %s" % effective_user)
         return
     self.connect(username=user)