예제 #1
0
    def test_03_start_slurm(self):
        core.config['slurm.service-name'] = 'slurm'
        if core.el_release() == 7:
            core.config['slurm.service-name'] += 'd'
            core.config['slurm.ctld-service-name'] = 'slurmctld'
        core.state['%s.started-service' % core.config['slurm.service-name']] = False
        self.slurm_reqs()
        self.skip_ok_if(service.is_running(core.config['slurm.service-name']), 'slurm already running')

        stat = core.get_stat(CTLD_LOG)

        if core.el_release() == 7:
            # slurmctld is handled by /etc/init.d/slurm on EL6
            command = ['slurmctld']
            core.check_system(command, 'enable slurmctld')
            service.check_start(core.config['slurm.service-name'])
            service.check_start(core.config['slurm.ctld-service-name'])
        else:
            service.check_start(core.config['slurm.service-name'])

        core.monitor_file(CTLD_LOG,
                          stat,
                          'slurm_rpc_node_registration complete for %s' % SHORT_HOSTNAME,
                          60.0)
        log_stat = core.get_stat(SLURM_LOG)
        core.monitor_file(SLURM_LOG,
                          log_stat,
                          'slurmd started',
                          60.0)
        command = ['scontrol', 'update', 'nodename=%s' % SHORT_HOSTNAME, 'state=idle']
        core.check_system(command, 'enable slurm node')
예제 #2
0
    def test_05_start_pbs(self):
        core.state['pbs_server.started-service'] = False
        core.state['torque.nodes-up'] = False

        core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
        self.skip_bad_unless(service.is_running('trqauthd'), 'pbs_server requires trqauthd')
        self.skip_ok_if(service.is_running('pbs_server'), 'pbs server already running')

        server_log = '/var/log/torque/server_logs/' + date.today().strftime('%Y%m%d')
        try:
            server_log_stat = os.stat(server_log)
        except OSError:
            server_log_stat = None

        service.check_start('pbs_server')

        # Wait until the server is up before writing the rest of the config
        core.monitor_file(server_log, server_log_stat, '.*Server Ready.*', 60.0)
        core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                          "Configuring pbs server",
                          shell=True)

        # wait up to 5 minutes for the server to recognize the node
        start_time = time.time()
        while (time.time() - start_time) < 600:
            command = ('/usr/bin/qnodes', '-s', core.get_hostname())
            stdout, _, fail = core.check_system(command, 'Get pbs node info')
            self.assert_(stdout.find('error') == -1, fail)
            if stdout.find('state = free'):
                core.state['torque.nodes-up'] = True
                break
        if not core.state['torque.nodes-up']:
            self.fail('PBS nodes not coming up')
예제 #3
0
    def test_03_start_slurm(self):
        core.config['slurm.service-name'] = 'slurm'
        if core.el_release() == 7:
            core.config['slurm.service-name'] += 'd'
            core.config['slurm.ctld-service-name'] = 'slurmctld'
        core.state['%s.started-service' %
                   core.config['slurm.service-name']] = False
        self.slurm_reqs()
        self.skip_ok_if(service.is_running(core.config['slurm.service-name']),
                        'slurm already running')

        stat = core.get_stat(CTLD_LOG)

        if core.el_release() == 7:
            # slurmctld is handled by /etc/init.d/slurm on EL6
            command = ['slurmctld']
            core.check_system(command, 'enable slurmctld')
            service.check_start(core.config['slurm.service-name'])
            service.check_start(core.config['slurm.ctld-service-name'])
        else:
            service.check_start(core.config['slurm.service-name'])

        core.monitor_file(
            CTLD_LOG, stat,
            'slurm_rpc_node_registration complete for %s' % SHORT_HOSTNAME,
            60.0)
        log_stat = core.get_stat(SLURM_LOG)
        core.monitor_file(SLURM_LOG, log_stat, 'slurmd started', 60.0)
        command = [
            'scontrol', 'update',
            'nodename=%s' % SHORT_HOSTNAME, 'state=idle'
        ]
        core.check_system(command, 'enable slurm node')
예제 #4
0
    def test_05_start_tomcat(self):
        core.skip_ok_unless_installed(tomcat.pkgname())
        core.state['tomcat.started'] = False
        catalina_log = tomcat.catalinafile()

        initial_stat = core.get_stat(catalina_log)

        tomcat_sentinel = r'Server startup in \d+ ms'
        # Bump log level
        core.config['tomcat.logging-conf'] = os.path.join(tomcat.sysconfdir(), 'logging.properties')
        files.append(core.config['tomcat.logging-conf'], 'org.apache.catalina.level = %s\n' % 'FINER',
                     owner='tomcat', backup=True)

        old_str  =  "1catalina.org.apache.juli.FileHandler.prefix = catalina."
        repl_str = ("1catalina.org.apache.juli.FileHandler.prefix = catalina\n"
                    "1catalina.org.apache.juli.FileHandler.rotatable = false")
        files.replace(core.config['tomcat.logging-conf'], old_str, repl_str,
                      owner='tomcat', backup=False)

        service.check_start(tomcat.pkgname())
        if core.options.nightly:
            timeout = 3600.0
        else:
            timeout = 1200.0
        line, gap = core.monitor_file(catalina_log, initial_stat, tomcat_sentinel, timeout)
        self.assert_(line is not None, 'Tomcat did not start within the %d min window' % int(timeout/60))
        core.state['tomcat.started'] = True
        core.log_message('Tomcat started after %.1f seconds' % gap)
예제 #5
0
    def isProbeInfoProcessed(self, ProbePattern):
        """This helper method parses gratia log for patterns signifying that Gratia has processed the probe information
    A. It loops through the lines with the pattern 'RecordProcessor: 0: ProbeDetails'
    B. Examines the output line to check if it contains the passed in Probe specific pattern, / AND the word saved
    Sample target lines from a gratia log is:
       2013-07-14 17:21:48,073 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 9 / 9
       (gridftp-transfer:fermicloud101.fnal.gov, recordId= Record
       (Id: fermicloud101.fnal.gov:3274.0 CreateTime: 14 July 2013 at 22:21:37 GMT KeyInfo: null) ) saved.

        2013-07-14 17:22:18,161 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 5 / 5
        (glexec:fermicloud101.fnal.gov, recordId= Record
        (Id: fermicloud101.fnal.gov:3299.0 CreateTime: 14 July 2013 at 22:21:48 GMT KeyInfo: null) ) saved.

        2013-07-14 17:23:18,294 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 2 / 2
        (condor:fermicloud101.fnal.gov, recordId= Record
        (Id: fermicloud101.fnal.gov:3390.0 CreateTime: 14 July 2013 at 22:22:48 GMT KeyInfo: null) ) saved.

        2013-07-14 17:24:50,465 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 31 / 31
        (pbs-lsf:fermicloud101.fnal.gov, recordId= Record
        (Id: fermicloud101.fnal.gov:4549.0 CreateTime: 14 July 2013 at 22:24:19 GMT KeyInfo: null) ) saved. """



        record_re = '.*' + 'RecordProcessor: 0: ProbeDetails' + '.*' + '/' + '.*' + ProbePattern + '.*' + 'saved'
        line, gap = core.monitor_file(core.config['gratia.log.file'], core.state['gratia.log.stat'], record_re, 600.0)
        if line is not None:
            core.log_message('Gratia processed probe data - Time taken is %.1f seconds' % gap)
            core.log_message('Gratia processed probe data - Line is ' + str(line))
            return True
        else:
            core.log_message('Did not find the search pattern within the given time limit.')
            return False
예제 #6
0
def wait_for_daemon(collector_log_path, stat, daemon, timeout):
    """Wait until the requested 'daemon' is available and accepting commands by
    monitoring the specified CollectorLog from the position specified by 'stat'
    for a maximum of 'timeout' seconds. Returns True if the daemon becomes
    available within the timeout period and False, otherwise.

    """
    sentinel = r'%sAd\s+:\s+Inserting' % daemon.capitalize()
    return bool(core.monitor_file(collector_log_path, stat, sentinel, timeout)[0])
예제 #7
0
    def test_01_wait_for_voms_admin(self):
        core.state['voms.started-webapp'] = False

        core.skip_ok_unless_installed('voms-admin-server')

        line, gap = core.monitor_file(core.config['voms.webapp-log'], core.state['voms.webapp-log-stat'],
                                      'VOMS-Admin started succesfully', 120.0)
        self.assert_(line is not None, 'VOMS Admin webapp started')
        core.state['voms.started-webapp'] = True
        core.log_message('VOMS Admin started after %.1f seconds' % gap)
예제 #8
0
    def test_01_wait_for_voms_admin(self):
        core.state['voms.started-webapp'] = False

        if core.missing_rpm('voms-admin-server'):
            return

        line, gap = core.monitor_file(core.config['voms.webapp-log'],
                                      core.state['voms.webapp-log-stat'],
                                      'VOMS-Admin started succesfully', 60.0)
        self.assert_(line is not None, 'VOMS Admin webapp started')
        core.state['voms.started-webapp'] = True
        core.log_message('VOMS Admin started after %.1f seconds' % gap)
예제 #9
0
    def test_05_start_pbs(self):
        core.state['pbs_server.started-service'] = False
        core.state['torque.nodes-up'] = False

        core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
        self.skip_bad_unless(service.is_running('trqauthd'),
                             'pbs_server requires trqauthd')
        self.skip_ok_if(service.is_running('pbs_server'),
                        'pbs server already running')

        server_log = '/var/log/torque/server_logs/' + date.today().strftime(
            '%Y%m%d')
        try:
            server_log_stat = os.stat(server_log)
        except OSError:
            server_log_stat = None

        service.check_start('pbs_server')

        # Wait until the server is up before writing the rest of the config
        core.monitor_file(server_log, server_log_stat, '.*Server Ready.*',
                          60.0)
        core.check_system("echo '%s' | qmgr %s" %
                          (self.pbs_config, core.get_hostname()),
                          "Configuring pbs server",
                          shell=True)

        # wait up to 5 minutes for the server to recognize the node
        start_time = time.time()
        while (time.time() - start_time) < 600:
            command = ('/usr/bin/qnodes', '-s', core.get_hostname())
            stdout, _, fail = core.check_system(command, 'Get pbs node info')
            self.assert_(stdout.find('error') == -1, fail)
            if stdout.find('state = free'):
                core.state['torque.nodes-up'] = True
                break
        if not core.state['torque.nodes-up']:
            self.fail('PBS nodes not coming up')
예제 #10
0
    def test_02_start_slurmdbd(self):
        core.state['slurmdbd.started-service'] = False
        core.state['slurmdbd.ready'] = False
        self.slurm_reqs()
        self.skip_bad_unless(mysql.is_running(), 'slurmdbd requires mysql')
        core.config['slurmdbd.config'] = os.path.join(
            core.config['slurm.config-dir'], 'slurmdbd.conf')
        core.config['slurmdbd.user'] = "******"
        core.config['slurmdbd.name'] = "osg_test_slurmdb"

        mysql.check_execute(
            "create database %s; " % core.config['slurmdbd.name'],
            'create slurmdb')
        mysql.check_execute("create user %s; " % core.config['slurmdbd.user'],
                            'add slurmdb user')
        mysql.check_execute(
            "grant usage on *.* to %s; " % core.config['slurmdbd.user'],
            'slurmdb user access')
        mysql.check_execute(
            "grant all privileges on %s.* to %s identified by '%s'; " %
            (core.config['slurmdbd.name'], core.config['slurmdbd.user'],
             core.options.password), 'slurmdb user permissions')
        mysql.check_execute("flush privileges;", 'reload privileges')

        files.write(core.config['slurmdbd.config'],
                    SLURMDBD_CONFIG.format(
                        name=core.config['slurmdbd.name'],
                        user=core.config['slurmdbd.user'].split('\'')[1],
                        password=core.options.password,
                        port=mysql.PORT),
                    owner='slurm',
                    chmod=0o644)

        stat = core.get_stat(SLURMDBD_LOG)
        service.check_start('slurmdbd')
        sentinel = core.monitor_file(SLURMDBD_LOG, stat,
                                     'slurmdbd version.+started', 30.0)
        if sentinel:
            core.state['slurmdbd.ready'] = True

        # Adding the cluster to the database
        command = ('sacctmgr', '-i', 'add', 'cluster', CLUSTER_NAME)
        core.check_system(command, 'add slurm cluster')
예제 #11
0
    def test_05_start_tomcat(self):
        core.skip_ok_unless_installed(tomcat.pkgname())
        core.state['tomcat.started'] = False
        catalina_log = tomcat.catalinafile()

        initial_stat = core.get_stat(catalina_log)

        tomcat_sentinel = r'Server startup in \d+ ms'
        # Bump log level
        core.config['tomcat.logging-conf'] = os.path.join(
            tomcat.sysconfdir(), 'logging.properties')
        files.append(core.config['tomcat.logging-conf'],
                     'org.apache.catalina.level = %s\n' % 'FINER',
                     owner='tomcat',
                     backup=True)

        old_str = "1catalina.org.apache.juli.FileHandler.prefix = catalina."
        repl_str = ("1catalina.org.apache.juli.FileHandler.prefix = catalina\n"
                    "1catalina.org.apache.juli.FileHandler.rotatable = false")
        files.replace(core.config['tomcat.logging-conf'],
                      old_str,
                      repl_str,
                      owner='tomcat',
                      backup=False)

        service.check_start(tomcat.pkgname())
        if core.options.nightly:
            timeout = 3600.0
        else:
            timeout = 1200.0
        line, gap = core.monitor_file(catalina_log, initial_stat,
                                      tomcat_sentinel, timeout)
        self.assert_(
            line is not None, 'Tomcat did not start within the %d min window' %
            int(timeout / 60))
        core.state['tomcat.started'] = True
        core.log_message('Tomcat started after %.1f seconds' % gap)
예제 #12
0
    def isProbeInfoProcessed(self, ProbePattern):
        """This helper method parses gratia log for patterns signifying that Gratia has processed the probe information
    A. It loops through the lines with the pattern 'RecordProcessor: 0: ProbeDetails'
    B. Examines the output line to check if it contains the passed in Probe specific pattern, / AND the word saved
    Sample target lines from a gratia log is:
       2013-07-14 17:21:48,073 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 9 / 9
       (gridftp-transfer:fermicloud101.fnal.gov, recordId= Record
       (Id: fermicloud101.fnal.gov:3274.0 CreateTime: 14 July 2013 at 22:21:37 GMT KeyInfo: null) ) saved.

        2013-07-14 17:22:18,161 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 5 / 5
        (glexec:fermicloud101.fnal.gov, recordId= Record
        (Id: fermicloud101.fnal.gov:3299.0 CreateTime: 14 July 2013 at 22:21:48 GMT KeyInfo: null) ) saved.

        2013-07-14 17:23:18,294 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 2 / 2
        (condor:fermicloud101.fnal.gov, recordId= Record
        (Id: fermicloud101.fnal.gov:3390.0 CreateTime: 14 July 2013 at 22:22:48 GMT KeyInfo: null) ) saved.

        2013-07-14 17:24:50,465 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 31 / 31
        (pbs-lsf:fermicloud101.fnal.gov, recordId= Record
        (Id: fermicloud101.fnal.gov:4549.0 CreateTime: 14 July 2013 at 22:24:19 GMT KeyInfo: null) ) saved. """

        record_re = '.*' + 'RecordProcessor: 0: ProbeDetails' + '.*' + '/' + '.*' + ProbePattern + '.*' + 'saved'
        line, gap = core.monitor_file(core.config['gratia.log.file'],
                                      core.state['gratia.log.stat'], record_re,
                                      600.0)
        if line is not None:
            core.log_message(
                'Gratia processed probe data - Time taken is %.1f seconds' %
                gap)
            core.log_message('Gratia processed probe data - Line is ' +
                             str(line))
            return True
        else:
            core.log_message(
                'Did not find the search pattern within the given time limit.')
            return False
예제 #13
0
    def test_04_start_pbs(self):
        if core.el_release() <= 6:
            core.config['torque.pbs-lockfile'] = '/var/lock/subsys/pbs_server'
        else:
            core.config['torque.pbs-lockfile'] = '/var/lib/torque/server_priv/server.lock'
        core.state['trqauthd.started-service'] = False
        core.state['torque.pbs-server-running'] = False
        core.state['torque.pbs-server-started'] = False
        core.state['torque.pbs-configured'] = False
        core.state['torque.nodes-up'] = False
        core.config['torque.pbs-nodes-file'] = '/var/lib/torque/server_priv/nodes'
        core.config['torque.pbs-servername-file'] = '/var/lib/torque/server_name'

        core.skip_ok_unless_installed(*self.required_rpms)
        if os.path.exists(core.config['torque.pbs-lockfile']):
            core.state['torque.pbs-server-running'] = True
            self.skip_ok('pbs server apparently running')

        # set hostname as servername instead of localhost
        files.write(core.config['torque.pbs-servername-file'],
                    "%s" % core.get_hostname(),
                    owner='pbs')
        core.state['torque.pbs-configured'] = True

        # trqauthd is required for the pbs_server
        service.start('trqauthd')

        if not os.path.exists('/var/lib/torque/server_priv/serverdb'):
            if core.el_release() <= 6:
                command = 'service pbs_server create' # this creates the default config and starts the service
            else:
                # XXX: "service pbs_server create" doesn't work for systemd, and I haven't found a
                #      systemd equivalent to do the "create" step in el7 ... The following was
                #      distilled from the el6 init.d script:  (but please correct as necessary)
                command = ('/usr/sbin/pbs_server -d /var/lib/torque -t create -f && '
                           'sleep 10 && /usr/bin/qterm')

            stdout, _, fail = core.check_system(command, 'create initial pbs serverdb config', shell=True)
            self.assert_(stdout.find('error') == -1, fail)

        # This gets wiped if we write it before the initial 'service pbs_server create'
        # However, this file needs to be in place before the service is started so we
        # restart the service after 'initial configuration'
        files.write(core.config['torque.pbs-nodes-file'], # add the local node as a compute node
                    "%s np=1 num_node_boards=1\n" % core.get_hostname(),
                    owner='pbs')

        # Sometimes the restart command throws an error on stop but still manages
        # to kill the service, meaning that the service doesn't get brought back up
        command = ('service', 'pbs_server', 'stop')
        core.system(command, 'stop pbs server daemon')

        server_log = '/var/log/torque/server_logs/' + date.today().strftime('%Y%m%d')
        try:
            server_log_stat = os.stat(server_log)
        except OSError:
            server_log_stat = None

        command = ('service', 'pbs_server', 'start')
        stdout, _, fail = core.check_system(command, 'Start pbs server daemon')
        self.assert_(stdout.find('error') == -1, fail)
        self.assert_(os.path.exists(core.config['torque.pbs-lockfile']),
                     'pbs server run lock file missing')
        core.state['torque.pbs-server-started'] = True
        core.state['torque.pbs-server-running'] = True

        # Wait until the server is up before writing the rest of the config
        core.monitor_file(server_log, server_log_stat, '.*Server Ready.*', 60.0)
        core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                          "Configuring pbs server",
                          shell=True)

        # wait up to 5 minutes for the server to recognize the node
        start_time = time.time()
        while (time.time() - start_time) < 600:
            command = ('/usr/bin/qnodes', '-s', core.get_hostname())
            stdout, _, fail = core.check_system(command, 'Get pbs node info')
            self.assert_(stdout.find('error') == -1, fail)
            if stdout.find('state = free'):
                core.state['torque.nodes-up'] = True
                break
        if not core.state['torque.nodes-up']:
            self.fail('PBS nodes not coming up')