# Excerpted test methods from the osg-test suite. The imports below are an
# assumption based on that suite's conventions; CTLD_LOG, SLURM_LOG,
# SLURMDBD_LOG, SLURMDBD_CONFIG, CLUSTER_NAME, and SHORT_HOSTNAME are
# module-level constants defined elsewhere in the suite.
import os
import time
from datetime import date

import osgtest.library.core as core
import osgtest.library.files as files
import osgtest.library.mysql as mysql
import osgtest.library.service as service
import osgtest.library.tomcat as tomcat


def test_03_start_slurm(self):
    core.config['slurm.service-name'] = 'slurm'
    if core.el_release() == 7:
        core.config['slurm.service-name'] += 'd'
        core.config['slurm.ctld-service-name'] = 'slurmctld'
    core.state['%s.started-service' % core.config['slurm.service-name']] = False
    self.slurm_reqs()
    self.skip_ok_if(service.is_running(core.config['slurm.service-name']), 'slurm already running')

    stat = core.get_stat(CTLD_LOG)

    if core.el_release() == 7:
        # slurmctld is handled by /etc/init.d/slurm on EL6
        command = ['slurmctld']
        core.check_system(command, 'enable slurmctld')
        service.check_start(core.config['slurm.service-name'])
        service.check_start(core.config['slurm.ctld-service-name'])
    else:
        service.check_start(core.config['slurm.service-name'])

    core.monitor_file(CTLD_LOG, stat,
                      'slurm_rpc_node_registration complete for %s' % SHORT_HOSTNAME,
                      60.0)

    log_stat = core.get_stat(SLURM_LOG)
    core.monitor_file(SLURM_LOG, log_stat, 'slurmd started', 60.0)
    command = ['scontrol', 'update', 'nodename=%s' % SHORT_HOSTNAME, 'state=idle']
    core.check_system(command, 'enable slurm node')
def test_05_start_pbs(self):
    core.state['pbs_server.started-service'] = False
    core.state['torque.nodes-up'] = False

    core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
    self.skip_bad_unless(service.is_running('trqauthd'), 'pbs_server requires trqauthd')
    self.skip_ok_if(service.is_running('pbs_server'), 'pbs server already running')

    server_log = '/var/log/torque/server_logs/' + date.today().strftime('%Y%m%d')
    try:
        server_log_stat = os.stat(server_log)
    except OSError:
        server_log_stat = None

    service.check_start('pbs_server')
    # Wait until the server is up before writing the rest of the config
    core.monitor_file(server_log, server_log_stat, '.*Server Ready.*', 60.0)
    core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                      "Configuring pbs server",
                      shell=True)

    # Wait up to 10 minutes for the server to recognize the node
    start_time = time.time()
    while (time.time() - start_time) < 600:
        command = ('/usr/bin/qnodes', '-s', core.get_hostname())
        stdout, _, fail = core.check_system(command, 'Get pbs node info')
        self.assert_(stdout.find('error') == -1, fail)
        # str.find() returns -1 (truthy) when the substring is absent, so a bare
        # find() check always passes; test substring membership instead
        if 'state = free' in stdout:
            core.state['torque.nodes-up'] = True
            break
        time.sleep(5)  # brief pause between polls
    if not core.state['torque.nodes-up']:
        self.fail('PBS nodes not coming up')
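# The wait-for-node loop above recurs in several of these tests. A minimal
# generic sketch of the pattern (illustrative only; poll_until is not part
# of osg-test):
def poll_until(check, timeout=600.0, interval=5.0):
    """Call check() every `interval` seconds until it returns True or
    `timeout` seconds elapse; return True on success, False on timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if check():
            return True
        time.sleep(interval)
    return False

# Hypothetical usage, mirroring the qnodes loop above:
#   nodes_up = poll_until(
#       lambda: 'state = free' in core.check_system(
#           ('/usr/bin/qnodes', '-s', core.get_hostname()), 'Get pbs node info')[0])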
def test_05_start_tomcat(self):
    core.skip_ok_unless_installed(tomcat.pkgname())
    core.state['tomcat.started'] = False
    catalina_log = tomcat.catalinafile()

    initial_stat = core.get_stat(catalina_log)
    tomcat_sentinel = r'Server startup in \d+ ms'
    # Bump log level
    core.config['tomcat.logging-conf'] = os.path.join(tomcat.sysconfdir(), 'logging.properties')
    files.append(core.config['tomcat.logging-conf'],
                 'org.apache.catalina.level = FINER\n',
                 owner='tomcat',
                 backup=True)

    old_str = "1catalina.org.apache.juli.FileHandler.prefix = catalina."
    repl_str = ("1catalina.org.apache.juli.FileHandler.prefix = catalina\n"
                "1catalina.org.apache.juli.FileHandler.rotatable = false")
    files.replace(core.config['tomcat.logging-conf'],
                  old_str,
                  repl_str,
                  owner='tomcat',
                  backup=False)

    service.check_start(tomcat.pkgname())
    if core.options.nightly:
        timeout = 3600.0
    else:
        timeout = 1200.0
    line, gap = core.monitor_file(catalina_log, initial_stat, tomcat_sentinel, timeout)
    self.assert_(line is not None,
                 'Tomcat did not start within the %d min window' % int(timeout / 60))
    core.state['tomcat.started'] = True
    core.log_message('Tomcat started after %.1f seconds' % gap)
def isProbeInfoProcessed(self, ProbePattern):
    """Parse the Gratia log for patterns signifying that Gratia has processed
    the probe information.

    A. Loop through the lines containing the pattern
       'RecordProcessor: 0: ProbeDetails'
    B. Examine each such line to check that it contains the passed-in
       probe-specific pattern AND the word 'saved'

    Sample target lines from a Gratia log:
    2013-07-14 17:21:48,073 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 9 / 9 (gridftp-transfer:fermicloud101.fnal.gov, recordId= Record (Id: fermicloud101.fnal.gov:3274.0 CreateTime: 14 July 2013 at 22:21:37 GMT KeyInfo: null) ) saved.
    2013-07-14 17:22:18,161 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 5 / 5 (glexec:fermicloud101.fnal.gov, recordId= Record (Id: fermicloud101.fnal.gov:3299.0 CreateTime: 14 July 2013 at 22:21:48 GMT KeyInfo: null) ) saved.
    2013-07-14 17:23:18,294 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 2 / 2 (condor:fermicloud101.fnal.gov, recordId= Record (Id: fermicloud101.fnal.gov:3390.0 CreateTime: 14 July 2013 at 22:22:48 GMT KeyInfo: null) ) saved.
    2013-07-14 17:24:50,465 gratia.service(Thread-66) [FINE]: RecordProcessor: 0: ProbeDetails 31 / 31 (pbs-lsf:fermicloud101.fnal.gov, recordId= Record (Id: fermicloud101.fnal.gov:4549.0 CreateTime: 14 July 2013 at 22:24:19 GMT KeyInfo: null) ) saved.
    """
    record_re = '.*RecordProcessor: 0: ProbeDetails.*/.*' + ProbePattern + '.*saved'
    line, gap = core.monitor_file(core.config['gratia.log.file'],
                                  core.state['gratia.log.stat'],
                                  record_re,
                                  600.0)
    if line is not None:
        core.log_message('Gratia processed probe data - Time taken is %.1f seconds' % gap)
        core.log_message('Gratia processed probe data - Line is ' + str(line))
        return True
    else:
        core.log_message('Did not find the search pattern within the given time limit.')
        return False
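# A quick self-contained check (illustrative only) that the record_re built in
# isProbeInfoProcessed matches one of the sample log lines from its docstring:
import re

_sample = ('2013-07-14 17:23:18,294 gratia.service(Thread-66) [FINE]: '
           'RecordProcessor: 0: ProbeDetails 2 / 2 (condor:fermicloud101.fnal.gov, '
           'recordId= Record (Id: fermicloud101.fnal.gov:3390.0 CreateTime: '
           '14 July 2013 at 22:22:48 GMT KeyInfo: null) ) saved.')
_record_re = '.*RecordProcessor: 0: ProbeDetails.*/.*' + 'condor' + '.*saved'
assert re.search(_record_re, _sample) is not None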
def wait_for_daemon(collector_log_path, stat, daemon, timeout):
    """Wait until the requested 'daemon' is available and accepting commands
    by monitoring the specified collector log from the position specified by
    'stat' for a maximum of 'timeout' seconds. Returns True if the daemon
    becomes available within the timeout period and False otherwise.
    """
    sentinel = r'%sAd\s+:\s+Inserting' % daemon.capitalize()
    return bool(core.monitor_file(collector_log_path, stat, sentinel, timeout)[0])
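# Hypothetical usage of wait_for_daemon (names are illustrative). The daemon
# string is capitalized into a sentinel such as 'ScheddAd : Inserting', the
# form HTCondor writes to its collector log as daemon ads arrive:
#
#   stat = core.get_stat(collector_log_path)
#   service.check_start('condor')
#   assert wait_for_daemon(collector_log_path, stat, 'schedd', 300.0)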
def test_01_wait_for_voms_admin(self):
    core.state['voms.started-webapp'] = False
    core.skip_ok_unless_installed('voms-admin-server')

    line, gap = core.monitor_file(core.config['voms.webapp-log'],
                                  core.state['voms.webapp-log-stat'],
                                  'VOMS-Admin started succesfully',  # sic: sentinel must match the webapp's own log spelling
                                  120.0)
    self.assert_(line is not None, 'VOMS Admin webapp started')
    core.state['voms.started-webapp'] = True
    core.log_message('VOMS Admin started after %.1f seconds' % gap)
def test_01_wait_for_voms_admin(self):
    core.state['voms.started-webapp'] = False
    if core.missing_rpm('voms-admin-server'):
        return

    line, gap = core.monitor_file(core.config['voms.webapp-log'],
                                  core.state['voms.webapp-log-stat'],
                                  'VOMS-Admin started succesfully',  # sic: see note above
                                  60.0)
    self.assert_(line is not None, 'VOMS Admin webapp started')
    core.state['voms.started-webapp'] = True
    core.log_message('VOMS Admin started after %.1f seconds' % gap)
def test_02_start_slurmdbd(self):
    core.state['slurmdbd.started-service'] = False
    core.state['slurmdbd.ready'] = False
    self.slurm_reqs()
    self.skip_bad_unless(mysql.is_running(), 'slurmdbd requires mysql')
    core.config['slurmdbd.config'] = os.path.join(core.config['slurm.config-dir'], 'slurmdbd.conf')
    core.config['slurmdbd.user'] = "******"  # redacted; expected form is "'user'@'host'" (see the split on quotes below)
    core.config['slurmdbd.name'] = "osg_test_slurmdb"

    mysql.check_execute("create database %s; " % core.config['slurmdbd.name'], 'create slurmdb')
    mysql.check_execute("create user %s; " % core.config['slurmdbd.user'], 'add slurmdb user')
    mysql.check_execute("grant usage on *.* to %s; " % core.config['slurmdbd.user'],
                        'slurmdb user access')
    mysql.check_execute("grant all privileges on %s.* to %s identified by '%s'; " %
                        (core.config['slurmdbd.name'],
                         core.config['slurmdbd.user'],
                         core.options.password),
                        'slurmdb user permissions')
    mysql.check_execute("flush privileges;", 'reload privileges')

    files.write(core.config['slurmdbd.config'],
                SLURMDBD_CONFIG.format(name=core.config['slurmdbd.name'],
                                       user=core.config['slurmdbd.user'].split('\'')[1],
                                       password=core.options.password,
                                       port=mysql.PORT),
                owner='slurm',
                chmod=0o644)
    stat = core.get_stat(SLURMDBD_LOG)
    service.check_start('slurmdbd')

    # core.monitor_file returns a (line, gap) tuple, which is always truthy;
    # test the matched line itself to know whether the sentinel appeared
    line, _ = core.monitor_file(SLURMDBD_LOG, stat, 'slurmdbd version.+started', 30.0)
    if line is not None:
        core.state['slurmdbd.ready'] = True

    # Adding the cluster to the database
    command = ('sacctmgr', '-i', 'add', 'cluster', CLUSTER_NAME)
    core.check_system(command, 'add slurm cluster')
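# SLURMDBD_CONFIG above is a module-level template defined elsewhere in the
# suite. An illustrative sketch (an assumption, not the suite's actual text)
# of the kind of slurmdbd.conf it could render, using standard slurmdbd keys:
SLURMDBD_CONFIG_EXAMPLE = """\
AuthType=auth/munge
DbdHost=localhost
StorageType=accounting_storage/mysql
StorageHost=localhost
StoragePort={port}
StorageUser={user}
StoragePass={password}
StorageLoc={name}
"""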
def test_04_start_pbs(self):
    if core.el_release() <= 6:
        core.config['torque.pbs-lockfile'] = '/var/lock/subsys/pbs_server'
    else:
        core.config['torque.pbs-lockfile'] = '/var/lib/torque/server_priv/server.lock'
    core.state['trqauthd.started-service'] = False
    core.state['torque.pbs-server-running'] = False
    core.state['torque.pbs-server-started'] = False
    core.state['torque.pbs-configured'] = False
    core.state['torque.nodes-up'] = False
    core.config['torque.pbs-nodes-file'] = '/var/lib/torque/server_priv/nodes'
    core.config['torque.pbs-servername-file'] = '/var/lib/torque/server_name'
    core.skip_ok_unless_installed(*self.required_rpms)

    if os.path.exists(core.config['torque.pbs-lockfile']):
        core.state['torque.pbs-server-running'] = True
        self.skip_ok('pbs server apparently running')

    # set hostname as servername instead of localhost
    files.write(core.config['torque.pbs-servername-file'],
                "%s" % core.get_hostname(),
                owner='pbs')
    core.state['torque.pbs-configured'] = True

    # trqauthd is required for the pbs_server
    service.start('trqauthd')

    if not os.path.exists('/var/lib/torque/server_priv/serverdb'):
        if core.el_release() <= 6:
            # this creates the default config and starts the service
            command = 'service pbs_server create'
        else:
            # XXX: "service pbs_server create" doesn't work for systemd, and I haven't found a
            # systemd equivalent to do the "create" step in el7 ... The following was
            # distilled from the el6 init.d script: (but please correct as necessary)
            command = ('/usr/sbin/pbs_server -d /var/lib/torque -t create -f && '
                       'sleep 10 && /usr/bin/qterm')
        stdout, _, fail = core.check_system(command, 'create initial pbs serverdb config', shell=True)
        self.assert_(stdout.find('error') == -1, fail)

    # This gets wiped if we write it before the initial 'service pbs_server create'
    # However, this file needs to be in place before the service is started so we
    # restart the service after 'initial configuration'
    files.write(core.config['torque.pbs-nodes-file'],
                # add the local node as a compute node
                "%s np=1 num_node_boards=1\n" % core.get_hostname(),
                owner='pbs')

    # Sometimes the restart command throws an error on stop but still manages
    # to kill the service, meaning that the service doesn't get brought back up
    command = ('service', 'pbs_server', 'stop')
    core.system(command, 'stop pbs server daemon')

    server_log = '/var/log/torque/server_logs/' + date.today().strftime('%Y%m%d')
    try:
        server_log_stat = os.stat(server_log)
    except OSError:
        server_log_stat = None

    command = ('service', 'pbs_server', 'start')
    stdout, _, fail = core.check_system(command, 'Start pbs server daemon')
    self.assert_(stdout.find('error') == -1, fail)
    self.assert_(os.path.exists(core.config['torque.pbs-lockfile']),
                 'pbs server run lock file missing')
    core.state['torque.pbs-server-started'] = True
    core.state['torque.pbs-server-running'] = True

    # Wait until the server is up before writing the rest of the config
    core.monitor_file(server_log, server_log_stat, '.*Server Ready.*', 60.0)
    core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                      "Configuring pbs server",
                      shell=True)

    # Wait up to 10 minutes for the server to recognize the node
    start_time = time.time()
    while (time.time() - start_time) < 600:
        command = ('/usr/bin/qnodes', '-s', core.get_hostname())
        stdout, _, fail = core.check_system(command, 'Get pbs node info')
        self.assert_(stdout.find('error') == -1, fail)
        # str.find() returns -1 (truthy) when the substring is absent; test membership instead
        if 'state = free' in stdout:
            core.state['torque.nodes-up'] = True
            break
        time.sleep(5)  # brief pause between polls
    if not core.state['torque.nodes-up']:
        self.fail('PBS nodes not coming up')