def test_05_start_pbs(self):
    core.state['pbs_server.started-service'] = False
    core.state['torque.nodes-up'] = False
    core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
    self.skip_bad_unless(service.is_running('trqauthd'), 'pbs_server requires trqauthd')
    self.skip_ok_if(service.is_running('pbs_server'), 'pbs server already running')

    server_log = '/var/log/torque/server_logs/' + date.today().strftime('%Y%m%d')
    try:
        server_log_stat = os.stat(server_log)
    except OSError:
        server_log_stat = None

    service.check_start('pbs_server')

    # Wait until the server is up before writing the rest of the config
    core.monitor_file(server_log, server_log_stat, '.*Server Ready.*', 60.0)
    core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                      "Configuring pbs server",
                      shell=True)

    # Wait up to 10 minutes for the server to recognize the node
    start_time = time.time()
    while (time.time() - start_time) < 600:
        command = ('/usr/bin/qnodes', '-s', core.get_hostname())
        stdout, _, fail = core.check_system(command, 'Get pbs node info')
        self.assert_(stdout.find('error') == -1, fail)
        if 'state = free' in stdout:
            core.state['torque.nodes-up'] = True
            break
    if not core.state['torque.nodes-up']:
        self.fail('PBS nodes not coming up')

def test_01_create_macaroons(self):
    core.config['xrootd.tpc.macaroon-1'] = None
    core.config['xrootd.tpc.macaroon-2'] = None
    core.skip_ok_unless_installed('x509-scitokens-issuer-client', by_dependency=True)
    self.skip_bad_unless(core.state['proxy.valid'], 'requires a proxy cert')
    uid = pwd.getpwnam(core.options.username)[2]
    usercert = '/tmp/x509up_u%d' % uid
    userkey = '/tmp/x509up_u%d' % uid

    core.config['xrootd.tpc.url-1'] = "https://" + core.get_hostname() + ":9001" + "/usr/share/osg-test/test_gridftp_data.txt"
    command = ('macaroon-init', core.config['xrootd.tpc.url-1'], '20', 'ALL')
    status, stdout, stderr = core.system(command, user=True)
    fail = core.diagnose('Obtain Macaroon 1', command, status, stdout, stderr)
    self.assertEqual(status, 0, fail)
    core.config['xrootd.tpc.macaroon-1'] = stdout.strip()

    core.config['xrootd.tpc.url-2'] = "https://" + core.get_hostname() + ":9002" + "/tmp/test_gridftp_data_tpc.txt"
    command = ('macaroon-init', core.config['xrootd.tpc.url-2'], '20', 'ALL')
    status, stdout, stderr = core.system(command, user=True)
    fail = core.diagnose('Obtain Macaroon 2', command, status, stdout, stderr)
    self.assertEqual(status, 0, fail)
    core.config['xrootd.tpc.macaroon-2'] = stdout.strip()

def test_04_start_pbs(self):
    core.config['torque.pbs-lockfile'] = '/var/lock/subsys/pbs_server'
    core.state['torque.pbs-server-running'] = False
    core.state['torque.pbs-configured'] = False
    core.state['torque.nodes-up'] = False
    if core.el_release() == 5:
        core.config['torque.pbs-nodes-file'] = '/var/torque/server_priv/nodes'
    elif core.el_release() == 6:
        core.config['torque.pbs-nodes-file'] = '/var/lib/torque/server_priv/nodes'
    else:
        core.skip('Distribution version not supported')

    if core.missing_rpm(*self.required_rpms):
        return
    if os.path.exists(core.config['torque.pbs-lockfile']):
        core.skip('pbs server apparently running')
        return

    # Add the local node as a compute node
    files.write(core.config['torque.pbs-nodes-file'],
                "%s np=1\n" % core.get_hostname(),
                owner='pbs')

    command = ('service', 'pbs_server', 'start')
    stdout, _, fail = core.check_system(command, 'Start pbs server daemon')
    self.assert_(stdout.find('error') == -1, fail)
    self.assert_(os.path.exists(core.config['torque.pbs-lockfile']),
                 'pbs server run lock file missing')
    core.state['torque.pbs-server'] = True
    core.state['torque.pbs-server-running'] = True

    core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                      "Configuring pbs server",
                      shell=True)
    core.state['torque.pbs-configured'] = True

    # Wait up to 10 minutes for the server to come up, and trigger a failure
    # if that doesn't happen
    start_time = time.time()
    while (time.time() - start_time) < 600:
        command = ('/usr/bin/qnodes', '-s', core.get_hostname())
        stdout, _, fail = core.check_system(command, 'Get pbs node info')
        self.assert_(stdout.find('error') == -1, fail)
        if 'state = free' in stdout:
            core.state['torque.nodes-up'] = True
            break
    if not core.state['torque.nodes-up']:
        self.fail('PBS nodes not coming up')

def test_08_https_fetch_from_auth_cache(self):
    core.skip_ok_unless_installed('globus-proxy-utils',
                                  'gfal2-plugin-http', 'gfal2-util', 'gfal2-plugin-file',
                                  by_dependency=True)
    self.skip_bad_unless(core.state['proxy.valid'], 'requires a proxy cert')
    name, contents = self.testfiles[3]
    path = os.path.join(getcfg("OriginAuthExport"), name)
    dest_file = '/tmp/testfileHTTPsFromAuthCache'
    uid = pwd.getpwnam(core.options.username)[2]
    usercert = '/tmp/x509up_u%d' % uid
    userkey = '/tmp/x509up_u%d' % uid
    result, _, _ = core.check_system(
        ["gfal-copy", "-vf",
         "--cert", usercert, "--key", userkey,
         "https://%s:%d%s" % (core.get_hostname(), getcfg("CacheHTTPSPort"), path),
         "file://%s" % dest_file],
        "Checking xrootd copy from Authenticated cache",
        user=True)
    origin_file = os.path.join(getcfg("OriginRootdir"),
                               getcfg("OriginAuthExport").lstrip("/"),
                               name)
    checksum_match = files.checksum_files_match(origin_file, dest_file)
    self.assert_(checksum_match,
                 'Origin and file downloaded via cache have different contents')

def test_01_gratia_admin_webpage(self):
    core.skip_ok_unless_installed('gratia-service')
    self.skip_bad_unless(core.state['tomcat.started'], 'Tomcat not started')
    host = core.get_hostname()
    admin_webpage = 'http://' + host + ':8880/gratia-administration/status.html?wantDetails=0'
    command = ('curl', admin_webpage)
    core.check_system(command, 'Unable to launch gratia admin webpage')

def test_02_start_mom(self):
    core.config['torque.mom-lockfile'] = '/var/lock/subsys/pbs_mom'
    core.state['torque.pbs-mom-running'] = False
    if core.missing_rpm(*self.required_rpms):
        return
    if os.path.exists(core.config['torque.mom-lockfile']):
        core.skip('pbs mom apparently running')
        return

    if core.el_release() == 5:
        core.config['torque.mom-config'] = '/var/torque/mom_priv/config'
    elif core.el_release() == 6:
        core.config['torque.mom-config'] = '/var/lib/torque/mom_priv/config'
    else:
        core.skip('Distribution version not supported')

    files.write(core.config['torque.mom-config'],
                "$pbsserver %s\n" % core.get_hostname(),
                owner='pbs')

    command = ('service', 'pbs_mom', 'start')
    stdout, _, fail = core.check_system(command, 'Start pbs mom daemon')
    self.assert_(stdout.find('error') == -1, fail)
    self.assert_(os.path.exists(core.config['torque.mom-lockfile']),
                 'PBS mom run lock file missing')
    core.state['torque.pbs-mom-running'] = True

def test_02_condor_ce_run_condor(self):
    core.skip_ok_unless_installed('htcondor-ce', 'htcondor-ce-client', 'htcondor-ce-condor', 'condor')
    self.skip_bad_unless(service.is_running('condor-ce'), 'ce not running')
    self.skip_bad_unless(service.is_running('condor'), 'condor not running')
    self.skip_bad_unless(core.state['jobs.env-set'], 'job environment not set')
    token_file = core.config['token.condor_write']
    self.skip_bad_unless(core.state['proxy.valid'] or os.path.exists(token_file),
                         'requires a scitoken or a proxy')

    command = ['condor_ce_run', '--debug', '-r', '%s:9619' % core.get_hostname(), '/bin/env']

    if os.path.exists(token_file):
        # FIXME: After HTCONDOR-636 is released (targeted for HTCondor-CE 5.1.2),
        # we can stop setting _condor_SCITOKENS_FILE
        for token_var in ('_condor_SCITOKENS_FILE', 'BEARER_TOKEN_FILE'):
            os.environ[token_var] = token_file
    else:
        core.log_message('condor WRITE token not found; skipping SCITOKENS auth')

    if core.osg_release() == "3.6" and \
            core.PackageVersion('condor') >= '9.0.0' and \
            core.PackageVersion('condor') < '9.0.8':
        with core.no_x509(core.options.username):
            self.run_job_in_tmp_dir(command, 'condor_ce_run a Condor job')
    else:
        self.run_job_in_tmp_dir(command, 'condor_ce_run a Condor job')

def test_01_request_condor_write_scitoken(self):
    core.state['token.condor_write_created'] = False
    core.config['token.condor_write'] = '/tmp/condor_write.scitoken'
    core.skip_ok_unless_installed('htcondor-ce', 'condor')
    self.skip_ok_if(core.PackageVersion('condor') <= '8.9.4',
                    'HTCondor version does not support SciToken submission')
    self.skip_ok_if(os.path.exists(core.config['token.condor_write']),
                    'SciToken with HTCondor WRITE already exists')

    hostname = core.get_hostname()
    try:
        token = request_demo_scitoken('condor:/READ condor:/WRITE',
                                      audience=f'{hostname}:9619')
    except error.URLError as exc:
        self.fail(f"Failed to request token from demo.scitokens.org:\n{exc}")

    ids = (0, 0)
    if core.state['user.verified']:
        user = pwd.getpwnam(core.options.username)
        ids = (user.pw_uid, user.pw_gid)

    files.write(core.config['token.condor_write'],
                core.to_str(token),
                backup=False,
                chown=ids)
    core.state['token.condor_write_created'] = True

def test_03_config_parameters(self):
    core.skip_ok_unless_installed('gratia-service')
    core.config['gratia.host'] = core.get_hostname()
    core.config['gratia.config.dir'] = '/etc/gratia'
    # The name of the gratia directory changed between versions
    gratia_version = core.get_package_envra('gratia-service')[2]
    gratia_version_split = gratia_version.split('.')
    if self.tuple_cmp(gratia_version_split, ['1', '13', '5']) < 0:
        core.config['gratia.directory'] = "collector"
    else:
        core.config['gratia.directory'] = "services"

    core.config['certs.httpcert'] = '/etc/grid-security/http/httpcert.pem'
    core.config['certs.httpkey'] = '/etc/grid-security/http/httpkey.pem'

    filename = "/tmp/gratia_reader_pass." + str(os.getpid()) + ".txt"
    contents = "[client]\npassword=reader\n"
    files.write(filename, contents, backup=False)
    core.config['gratia.sql.file'] = filename
    core.config['gratia.sql.querystring'] = ("\" | mysql --defaults-extra-file=\"" +
                                             core.config['gratia.sql.file'] +
                                             "\" --skip-column-names -B --unbuffered --user=reader --port=3306")
    core.config['gratia.tmpdir.prefix'] = "/var/lib/gratia/tmp/gratiafiles/"
    core.config['gratia.tmpdir.postfix'] = ("_" + core.config['gratia.host'] + "_" +
                                            core.config['gratia.host'] + "_8880")
    core.config['gratia.log.file'] = "/var/log/gratia-service/gratia.log"
    core.state['gratia.log.stat'] = None

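# The tuple_cmp helper used above is defined elsewhere in the test class and is
# not shown in this excerpt. A minimal sketch of the component-wise numeric
# comparison it is assumed to perform follows; the name matches the call site,
# but the body here is a hypothetical reconstruction, not the suite's actual code.
def tuple_cmp(self, version_a, version_b):
    # Compare version component lists numerically, returning -1, 0, or 1
    # like the old cmp() builtin; non-numeric components fall back to
    # string comparison.
    for a, b in zip(version_a, version_b):
        try:
            a, b = int(a), int(b)
        except ValueError:
            pass
        if a != b:
            return -1 if a < b else 1
    # A shorter version sorts first (e.g. 1.13 < 1.13.5)
    return (len(version_a) > len(version_b)) - (len(version_a) < len(version_b))
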
def test_04_configure_pbs(self):
    core.config['torque.pbs-nodes-file'] = '/var/lib/torque/server_priv/nodes'
    core.config['torque.pbs-serverdb'] = '/var/lib/torque/server_priv/serverdb'
    core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
    self.skip_bad_unless(service.is_running('trqauthd'), 'pbs_server requires trqauthd')
    self.skip_ok_if(service.is_running('pbs_server'), 'pbs server already running')

    files.preserve(core.config['torque.pbs-serverdb'], 'pbs')
    if not os.path.exists(core.config['torque.pbs-serverdb']):
        command = ('/usr/sbin/pbs_server -d /var/lib/torque -t create -f && '
                   'sleep 10 && /usr/bin/qterm')
        stdout, _, fail = core.check_system(command,
                                            'create initial pbs serverdb config',
                                            shell=True)
        self.assert_(stdout.find('error') == -1, fail)

    # This gets wiped if we write it before the initial 'service pbs_server create'.
    # However, this file needs to be in place before the service is started, so we
    # restart the service after 'initial configuration'.
    # Add the local node as a compute node.
    files.write(core.config['torque.pbs-nodes-file'],
                "%s np=1 num_node_boards=1\n" % core.get_hostname(),
                owner='pbs')

def test_02_condor_ce_run_condor(self):
    core.skip_ok_unless_installed('htcondor-ce', 'htcondor-ce-client', 'htcondor-ce-condor', 'condor')
    self.skip_bad_unless(service.is_running('condor-ce'), 'ce not running')
    self.skip_bad_unless(service.is_running('condor'), 'condor not running')
    self.skip_bad_unless(core.state['jobs.env-set'], 'job environment not set')
    self.skip_bad_unless(core.state['proxy.valid'] or core.state['token.condor_write_created'],
                         'requires a scitoken or a proxy')

    command = ['condor_ce_run', '--debug', '-r', '%s:9619' % core.get_hostname(), '/bin/env']

    if core.state['token.condor_write_created']:
        # FIXME: After HTCONDOR-636 is released (targeted for HTCondor-CE 5.1.2),
        # we can stop setting _condor_SCITOKENS_FILE
        for token_var in ('_condor_SCITOKENS_FILE', 'BEARER_TOKEN_FILE'):
            os.environ[token_var] = core.config['token.condor_write']
    else:
        core.log_message('condor WRITE token not found; skipping SCITOKENS auth')

    self.run_job_in_tmp_dir(command, 'condor_ce_run a Condor job')

def run_trace(self, *args):
    """Run condor_ce_trace along with any additional *args. If the trace
    completes with a held job, also return output from 'condor_ce_q -held'.
    """
    cwd = os.getcwd()
    os.chdir('/tmp')
    self.command += ['condor_ce_trace', '--debug'] + list(args) + [core.get_hostname()]
    if core.osg_release() == "3.6" and \
            core.PackageVersion('condor') >= '9.0.0' and \
            core.PackageVersion('condor') < '9.0.8':
        with core.no_x509(core.options.username):
            trace_rc, trace_out, trace_err = core.system(self.command, user=True)
    else:
        trace_rc, trace_out, trace_err = core.system(self.command, user=True)
    os.chdir(cwd)

    if trace_rc:
        msg = 'condor_ce_trace failed'
        hold_out = hold_err = ''
        if ', was held' in trace_out:
            msg = 'condor_ce_trace job held'
            _, hold_out, hold_err = core.system(('condor_ce_q', '-held'))
        self.fail(core.diagnose(msg, self.command, trace_rc,
                                str(trace_out) + str(hold_out),
                                str(trace_err) + str(hold_err)))
    return trace_out, trace_err

def modify_probeconfig(self, probeconfig):
    """Modify the Probe Configuration, as generally needed by many probes."""
    # Back up the existing ProbeConfig before any modification so that it can be
    # restored later. Note that "owner" has to be a unique string, since the
    # "ProbeConfig" filename is the same for all probes. For example, if the
    # ProbeConfig path is /etc/gratia/gridftp-transfer/ProbeConfig, then
    # "owner" = "gridftp-transfer".
    owner = os.path.basename(os.path.dirname(probeconfig))
    files.preserve(probeconfig, owner)

    host = core.get_hostname()
    collectorhost = " CollectorHost=\"" + host + ":8880\""
    sslhost = " SSLHost=\"" + host + ":8443\""
    sslregistrationhost = " SSLRegistrationHost=\"" + host + ":8880\""
    self.patternreplace(probeconfig, "CollectorHost", collectorhost)
    self.patternreplace(probeconfig, "SSLHost", sslhost)
    self.patternreplace(probeconfig, "SSLRegistrationHost", sslregistrationhost)
    self.patternreplace(probeconfig, "SiteName", "SiteName=\"OSG Test site\"")
    self.patternreplace(probeconfig, "EnableProbe", "EnableProbe=\"1\"")
    # If no line matching the QuarantineUnknownVORecords pattern is found,
    # insert one after the QuarantineSize line.
    if not self.patternreplace(probeconfig, "QuarantineUnknownVORecords=",
                               "QuarantineUnknownVORecords=\"0\""):
        self.patternreplace(probeconfig, "QuarantineSize=",
                            "QuarantineUnknownVORecords=\"0\"", insert_after=True)

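# patternreplace is another helper defined elsewhere in the suite. A minimal
# sketch of the semantics the method above assumes — replace the first line
# containing a pattern (or insert after it when insert_after=True) and report
# whether the pattern was found. This is a hypothetical reconstruction, not
# the suite's actual implementation.
def patternreplace(self, filename, pattern, full_line, insert_after=False):
    # Rewrite `filename`, replacing each line that contains `pattern` with
    # `full_line`, or appending `full_line` after it when insert_after=True.
    # Returns True if the pattern was found anywhere in the file.
    with open(filename) as handle:
        lines = handle.readlines()
    found = False
    new_lines = []
    for line in lines:
        if pattern in line:
            found = True
            if insert_after:
                new_lines.append(line)
            new_lines.append(full_line.rstrip('\n') + '\n')
        else:
            new_lines.append(line)
    with open(filename, 'w') as handle:
        handle.writelines(new_lines)
    return found
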
def test_02_start_mom(self):
    if core.el_release() <= 6:
        core.config['torque.mom-lockfile'] = '/var/lock/subsys/pbs_mom'
    else:
        core.config['torque.mom-lockfile'] = '/var/lib/torque/mom_priv/mom.lock'
    core.state['torque.pbs-mom-running'] = False

    core.skip_ok_unless_installed(*self.required_rpms)
    self.skip_ok_if(os.path.exists(core.config['torque.mom-lockfile']),
                    'pbs mom apparently running')

    core.config['torque.mom-config'] = '/var/lib/torque/mom_priv/config'
    files.write(core.config['torque.mom-config'],
                "$pbsserver %s\n" % core.get_hostname(),
                owner='pbs')
    core.config['torque.mom-layout'] = '/var/lib/torque/mom_priv/mom.layout'
    files.write(core.config['torque.mom-layout'], "nodes=0", owner='pbs')

    command = ('service', 'pbs_mom', 'start')
    stdout, _, fail = core.check_system(command, 'Start pbs mom daemon')
    self.assert_(stdout.find('error') == -1, fail)
    self.assert_(os.path.exists(core.config['torque.mom-lockfile']),
                 'PBS mom run lock file missing')
    core.state['torque.pbs-mom-running'] = True

def test_05_my_proxy_retrieval(self):
    core.skip_ok_unless_installed('myproxy', 'myproxy-server')
    self.skip_bad_unless(core.state['myproxy.started-server'], 'MyProxy server failed to start')
    self.skip_bad_unless(core.state['myproxy.created'], 'MyProxy creation failed')
    command = ('myproxy-logon', '--verbose',
               '-s', core.get_hostname(),
               '-l', core.options.username)
    password = core.config['myproxy.password'] + '\n'
    core.check_system(command, 'myproxy-logon retrieval', user=True, stdin=password)

def test_05_condor_ce_run_condor(self):
    core.skip_ok_unless_installed('htcondor-ce', 'htcondor-ce-client', 'htcondor-ce-condor', 'condor')
    self.skip_bad_unless(core.state['condor-ce.started'], 'ce not started')
    self.skip_bad_unless(core.state['jobs.env-set'], 'job environment not set')

    command = ('condor_ce_run', '-r', '%s:9619' % core.get_hostname(), '/bin/env')
    self.run_job_in_tmp_dir(command, 'condor_ce_run a Condor job')

def test_06_config_misc_file(self):
    core.skip_ok_unless_installed('osg-info-services')
    core.skip_ok_unless_one_installed(*self.possible_rpms)
    core.config['osg-info-services.misc-file'] = '/etc/osg/config.d/10-misc.ini'
    files.replace(core.config['osg-info-services.misc-file'],
                  'gums_host = DEFAULT',
                  'gums_host = ' + core.get_hostname(),
                  owner='root')

def test_01_remove_proxy(self):
    core.skip_ok_unless_installed('myproxy', 'myproxy-server')
    self.skip_ok_unless(core.state['myproxy.started-server'], 'MyProxy server failed to start')
    # If there is no pre-existing proxy file, the following command will
    # produce error output and have exit status 1; because this is the
    # expected (but not the only valid) case, do not check the output or
    # exit status. This test exists only to clear out a pre-existing proxy.
    command = ('myproxy-destroy', '--verbose',
               '-s', core.get_hostname(),
               '-l', core.options.username)
    core.system(command, user=True)

def test_03_configure_ce(self):
    core.skip_ok_unless_installed('condor', 'htcondor-ce', 'htcondor-ce-client')

    # Set up Condor, PBS, and Slurm routes.
    # Leave the GRIDMAP knob intact to verify that it works with the LCMAPS VOMS plugin.
    core.config['condor-ce.condor-ce-cfg'] = '/etc/condor-ce/config.d/99-osgtest.condor-ce.conf'

    # Add host DN to condor_mapfile
    if core.options.hostcert:
        core.config['condor-ce.condorce_mapfile'] = '/etc/condor-ce/condor_mapfile.osg-test'
        hostcert_dn, _ = cagen.certificate_info(core.config['certs.hostcert'])
        mapfile_contents = files.read('/etc/condor-ce/condor_mapfile')
        mapfile_contents.insert(0, re.sub(r'([/=\.])', r'\\\1', "GSI \"^%s$\" " % hostcert_dn) +
                                "%[email protected]\n" % core.get_hostname())
        files.write(core.config['condor-ce.condorce_mapfile'],
                    mapfile_contents,
                    owner='condor-ce',
                    chmod=0o644)
    else:
        core.config['condor-ce.condorce_mapfile'] = '/etc/condor-ce/condor_mapfile'

    condor_contents = """GRIDMAP = /etc/grid-security/grid-mapfile
CERTIFICATE_MAPFILE = %s
ALL_DEBUG=D_FULLDEBUG
JOB_ROUTER_DEFAULTS = $(JOB_ROUTER_DEFAULTS) [set_default_maxMemory = 128;]
JOB_ROUTER_ENTRIES = \\
   [ \\
     GridResource = "batch pbs"; \\
     TargetUniverse = 9; \\
     name = "Local_PBS"; \\
     Requirements = target.osgTestBatchSystem =?= "pbs"; \\
   ] \\
   [ \\
     GridResource = "batch slurm"; \\
     TargetUniverse = 9; \\
     name = "Local_Slurm"; \\
     Requirements = target.osgTestBatchSystem =?= "slurm"; \\
   ] \\
   [ \\
     TargetUniverse = 5; \\
     name = "Local_Condor"; \\
     Requirements = (target.osgTestBatchSystem =!= "pbs" && target.osgTestBatchSystem =!= "slurm"); \\
   ]

JOB_ROUTER_SCHEDD2_SPOOL=/var/lib/condor/spool
JOB_ROUTER_SCHEDD2_NAME=$(FULL_HOSTNAME)
JOB_ROUTER_SCHEDD2_POOL=$(FULL_HOSTNAME):9618
""" % core.config['condor-ce.condorce_mapfile']

    if core.rpm_is_installed('htcondor-ce-view'):
        condor_contents += "\nDAEMON_LIST = $(DAEMON_LIST), CEVIEW, GANGLIAD, SCHEDD"
        core.config['condor-ce.view-port'] = condor.ce_config_val('HTCONDORCE_VIEW_PORT')

    files.write(core.config['condor-ce.condor-ce-cfg'],
                condor_contents,
                owner='condor-ce',
                chmod=0o644)

def advertise_vomses(vo, hostcert='/etc/grid-security/hostcert.pem'):
    """Edit /etc/vomses to advertise the current host as the VOMS server for
    the given VO. The caller is responsible for preserving and restoring
    /etc/vomses.
    """
    host_dn, _ = cagen.certificate_info(hostcert)
    hostname = core.get_hostname()
    vomses_path = '/etc/vomses'
    contents = ('"%s" "%s" "%d" "%s" "%s"\n' %
                (vo, hostname, VOPORT, host_dn, vo))
    files.write(vomses_path, contents, backup=False, chmod=0o644)

def advertise_lsc(vo, hostcert='/etc/grid-security/hostcert.pem'):
    """Create the VO directory and .lsc file under /etc/grid-security/vomsdir
    for the given VO."""
    host_dn, host_issuer = cagen.certificate_info(hostcert)
    hostname = core.get_hostname()
    lsc_dir = os.path.join('/etc/grid-security/vomsdir', vo)
    if not os.path.isdir(lsc_dir):
        os.makedirs(lsc_dir)
    vo_lsc_path = os.path.join(lsc_dir, hostname + '.lsc')
    files.write(vo_lsc_path, (host_dn + '\n', host_issuer + '\n'), backup=False, chmod=0o644)

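# Taken together, a caller would preserve /etc/vomses (as the advertise_vomses
# docstring requires), advertise a test VO, and restore the file afterwards.
# A hypothetical usage sketch — the VO name 'osgtestvo' and the 'voms' backup
# owner tag are invented for illustration:
files.preserve('/etc/vomses', 'voms')
try:
    advertise_vomses('osgtestvo')
    advertise_lsc('osgtestvo')
    # ... run VOMS-dependent tests here ...
finally:
    files.restore('/etc/vomses', 'voms')
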
def test_07_ceview(self):
    core.config['condor-ce.view-listening'] = False
    self.general_requirements()
    core.skip_ok_unless_installed('htcondor-ce-view')
    view_url = 'http://%s:%s' % (core.get_hostname(), int(core.config['condor-ce.view-port']))
    try:
        src = urllib2.urlopen(view_url).read()
    except urllib2.URLError:
        self.fail('Could not reach HTCondor-CE View at %s' % view_url)
    self.assert_(re.search(r'HTCondor-CE Overview', src),
                 'Failed to find expected CE View contents')
    core.config['condor-ce.view-listening'] = True

def test_08_ceview(self):
    core.config['condor-ce.view-listening'] = False
    self.general_requirements()
    core.skip_ok_unless_installed('htcondor-ce-view')
    view_url = 'http://%s:%s' % (core.get_hostname(), int(core.config['condor-ce.view-port']))
    try:
        src = urllib2.urlopen(view_url).read()
    except urllib2.URLError:
        self.fail('Could not reach HTCondor-CE View at %s' % view_url)
    self.assert_(re.search(r'HTCondor-CE Overview', src),
                 'Failed to find expected CE View contents')
    core.config['condor-ce.view-listening'] = True

def test_04_trace(self):
    self.general_requirements()
    self.skip_bad_unless(core.state['condor-ce.schedd-ready'], 'CE schedd not ready to accept jobs')
    cwd = os.getcwd()
    os.chdir('/tmp')
    command = ('condor_ce_trace', '--debug', core.get_hostname())
    core.check_system(command, 'ce trace', user=True)
    os.chdir(cwd)

def test_03_start_trqauthd(self):
    core.state['trqauthd.started-service'] = False
    core.config['torque.pbs-servername-file'] = '/var/lib/torque/server_name'
    core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
    self.skip_ok_if(service.is_running('trqauthd'), 'trqauthd is already running')
    # Set the hostname as the server name instead of localhost;
    # this config is required before starting trqauthd.
    files.write(core.config['torque.pbs-servername-file'],
                "%s" % core.get_hostname(),
                owner='pbs')
    service.check_start('trqauthd')

def test_01_create_macaroons(self):
    core.skip_ok_unless_installed('xrootd', 'xrootd-scitokens', 'x509-scitokens-issuer-client',
                                  by_dependency=True)
    self.skip_bad_unless(core.state['proxy.created'], 'Proxy creation failed')
    uid = pwd.getpwnam(core.options.username)[2]
    usercert = '/tmp/x509up_u%d' % uid
    userkey = '/tmp/x509up_u%d' % uid

    core.config['xrootd.tpc.url-1'] = "https://" + core.get_hostname() + ":9001" + "/usr/share/osg-test/test_gridftp_data.txt"
    command = ('macaroon-init', core.config['xrootd.tpc.url-1'], '20', 'DOWNLOAD')
    status, stdout, stderr = core.system(command, user=True)
    fail = core.diagnose('Obtain Macaroon 1', command, status, stdout, stderr)
    self.assertEqual(status, 0, fail)
    core.config['xrootd.tpc.macaroon-1'] = stdout.strip()

    core.config['xrootd.tpc.url-2'] = "https://" + core.get_hostname() + ":9002" + "/tmp/test_gridftp_data_tpc.txt"
    command = ('macaroon-init', core.config['xrootd.tpc.url-2'], '20', 'UPLOAD')
    status, stdout, stderr = core.system(command, user=True)
    fail = core.diagnose('Obtain Macaroon 2', command, status, stdout, stderr)
    self.assertEqual(status, 0, fail)
    core.config['xrootd.tpc.macaroon-2'] = stdout.strip()

def test_02_condor_ce_run_condor(self):
    core.skip_ok_unless_installed('htcondor-ce', 'htcondor-ce-client', 'htcondor-ce-condor', 'condor')
    self.skip_bad_unless(service.is_running('condor-ce'), 'ce not running')
    self.skip_bad_unless(service.is_running('condor'), 'condor not running')
    self.skip_bad_unless(core.state['jobs.env-set'], 'job environment not set')

    command = ('condor_ce_run', '-r', '%s:9619' % core.get_hostname(), '/bin/env')
    self.run_job_in_tmp_dir(command, 'condor_ce_run a Condor job')

def test_04_myproxy_init(self):
    core.skip_ok_unless_installed('myproxy', 'myproxy-server')
    self.skip_bad_unless(core.state['myproxy.started-server'], 'MyProxy server failed to start')
    core.state['myproxy.created'] = False
    core.config['myproxy.password'] = '******'
    # The -S option makes myproxy-init accept the password on stdin
    command = ('myproxy-init', '--verbose',
               '-C', core.state['proxy.path'],
               '-y', core.state['proxy.path'],
               '-s', core.get_hostname(),
               '-S',
               '-l', core.options.username)
    # Hand an already-created proxy to MyProxy, along with the password used to store it
    password = core.config['myproxy.password']
    core.check_system(command, 'Normal myproxy-init', user=True, stdin=password)
    core.state['myproxy.created'] = True

def test_01_start_mom(self):
    core.state['pbs_mom.started-service'] = False
    core.skip_ok_unless_installed(*self.required_rpms, by_dependency=True)
    self.skip_ok_if(service.is_running('pbs_mom'), 'PBS mom already running')
    core.config['torque.mom-config'] = '/var/lib/torque/mom_priv/config'
    files.write(core.config['torque.mom-config'],
                "$pbsserver %s\n" % core.get_hostname(),
                owner='pbs')
    core.config['torque.mom-layout'] = '/var/lib/torque/mom_priv/mom.layout'
    files.write(core.config['torque.mom-layout'], "nodes=0", owner='pbs')
    service.check_start('pbs_mom')

def test_03_configure_globus_pbs(self):
    core.config['globus.pbs-config'] = '/etc/globus/globus-pbs.conf'
    core.state['globus.pbs_configured'] = False
    core.skip_ok_unless_installed('globus-gram-job-manager-pbs')
    config_file = open(core.config['globus.pbs-config']).read()
    server_name = core.get_hostname()
    re_obj = re.compile('^pbs_default=.*$', re.MULTILINE)
    if 'pbs_default' in config_file:
        config_file = re_obj.sub("pbs_default=\"%s\"" % server_name, config_file)
    else:
        config_file += "pbs_default=\"%s\"" % server_name
    files.write(core.config['globus.pbs-config'], config_file, owner='pbs')
    core.state['globus.pbs_configured'] = True

def test_03_configure_globus_pbs(self):
    core.config['globus.pbs-config'] = '/etc/globus/globus-pbs.conf'
    core.state['globus.pbs_configured'] = False
    if not core.rpm_is_installed('globus-gram-job-manager-pbs'):
        return
    config_file = open(core.config['globus.pbs-config']).read()
    server_name = core.get_hostname()
    re_obj = re.compile('^pbs_default=.*$', re.MULTILINE)
    if 'pbs_default' in config_file:
        config_file = re_obj.sub("pbs_default=\"%s\"" % server_name, config_file)
    else:
        config_file += "pbs_default=\"%s\"" % server_name
    files.write(core.config['globus.pbs-config'], config_file, 'pbs')
    core.state['globus.pbs_configured'] = True

def test_07_ceview(self):
    core.config['condor-ce.view-listening'] = False
    core.skip_ok_unless_installed('htcondor-ce-view')
    view_url = 'http://%s:%s' % (core.get_hostname(), int(core.config['condor-ce.view-port']))
    try:
        src = core.to_str(urlopen(view_url).read())
        core.log_message(src)
    except EnvironmentError as err:
        debug_file = '/var/log/condor-ce/CEViewLog'
        debug_contents = 'Contents of %s\n%s\n' % (debug_file, '=' * 20)
        try:
            debug_contents += files.read(debug_file, True)
        except EnvironmentError:
            debug_contents += 'Failed to read %s\n' % debug_file
        core.log_message(debug_contents)
        self.fail('Could not reach HTCondor-CE View at %s: %s' % (view_url, err))
    self.assertTrue(re.search(r'HTCondor-CE Overview', src),
                    'Failed to find expected CE View contents')
    core.config['condor-ce.view-listening'] = True

def test_07_xrootd_fetch_from_auth_cache(self):
    core.skip_ok_unless_installed('globus-proxy-utils', by_dependency=True)
    self.skip_bad_unless(core.state['proxy.valid'], 'requires a proxy cert')
    name, contents = self.testfiles[2]
    path = os.path.join(getcfg("OriginAuthExport"), name)
    os.environ["XrdSecGSISRVNAMES"] = "*"
    dest_file = '/tmp/testfileXrootdFromAuthCache'
    result, _, _ = core.check_system(
        ["xrdcp", "-d1", "-f",
         "root://%s:%d/%s" % (core.get_hostname(), getcfg("CacheHTTPSPort"), path),
         dest_file],
        "Checking xrootd copy from Authenticated cache",
        user=True)
    origin_file = os.path.join(getcfg("OriginRootdir"),
                               getcfg("OriginAuthExport").lstrip("/"),
                               name)
    checksum_match = files.checksum_files_match(origin_file, dest_file)
    self.assert_(checksum_match,
                 'Origin and file downloaded via cache have different contents')

def test_08_config_site_info_file(self):
    core.skip_ok_unless_installed('osg-info-services')
    core.skip_ok_unless_one_installed(*self.possible_rpms)
    core.config['osg-info-services.siteinfo-file'] = '/etc/osg/config.d/40-siteinfo.ini'
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'group = OSG', 'group = OSG-ITB', owner='root')
    files.replace_regexpr(core.config['osg-info-services.siteinfo-file'],
                          'host_name = *', 'host_name = ' + core.get_hostname(), backup=False)
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'sponsor = UNAVAILABLE', 'sponsor = mis:100', backup=False)
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'contact = UNAVAILABLE', 'contact = Lando Calrissian', backup=False)
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'email = UNAVAILABLE', 'email = [email protected]', backup=False)
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'city = UNAVAILABLE', 'city = Cloud City', backup=False)
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'country = UNAVAILABLE', 'country = Bespin', backup=False)
    files.replace_regexpr(core.config['osg-info-services.siteinfo-file'],
                          'longitude =*', 'longitude = -1', backup=False)
    files.replace(core.config['osg-info-services.siteinfo-file'],
                  'latitude = UNAVAILABLE', 'latitude = 45', backup=False)

def run_trace(self, *args):
    """Run condor_ce_trace along with any additional *args. If the trace
    completes with a held job, also return output from 'condor_ce_q -held'.
    """
    cwd = os.getcwd()
    os.chdir('/tmp')
    self.command += ['condor_ce_trace', '--debug'] + list(args) + [core.get_hostname()]
    trace_rc, trace_out, trace_err = core.system(self.command, user=True)
    os.chdir(cwd)

    if trace_rc:
        msg = 'condor_ce_trace failed'
        hold_out = hold_err = ''
        if ', was held' in trace_out:
            msg = 'condor_ce_trace job held'
            _, hold_out, hold_err = core.system(('condor_ce_q', '-held'))
        self.fail(core.diagnose(msg, self.command, trace_rc,
                                str(trace_out) + str(hold_out),
                                str(trace_err) + str(hold_err)))
    return trace_out, trace_err

def run_blahp_trace(self, lrms):
    """Run condor_ce_trace() against a non-HTCondor backend and verify the cache"""
    lrms_cache_prefix = {'pbs': 'qstat', 'slurm': 'slurm'}

    cwd = os.getcwd()
    os.chdir('/tmp')
    command = ('condor_ce_trace', '-a osgTestBatchSystem = %s' % lrms.lower(),
               '--debug', core.get_hostname())
    trace_out, _, _ = core.check_system(command, 'ce trace against %s' % lrms.lower(), user=True)

    try:
        backend_jobid = re.search(r'%s_JOBID=(\d+)' % lrms.upper(), trace_out).group(1)
    except AttributeError:
        # re.search() returned None: failed to find the backend job ID
        self.fail('did not run against %s' % lrms.upper())

    cache_file = '/var/tmp/%s_cache_%s/blahp_results_cache' % (lrms_cache_prefix[lrms.lower()],
                                                               core.options.username)
    with open(cache_file, 'r') as handle:
        cache = handle.read()

    # Verify the backend job ID in the cache, handling the multiple formats used
    # by different versions of the blahp. For blahp-1.18.16.bosco-1.osg32:
    #
    # 2: [BatchJobId="2"; WorkerNode="fermicloud171.fnal.gov-0"; JobStatus=4; ExitCode= 0; ]\n
    #
    # For blahp-1.18.25.bosco-1.osg33:
    #
    # 5347907 "(dp0
    # S'BatchJobId'
    # p1
    # S'""5347907""'
    # p2
    # sS'WorkerNode'
    # p3
    # S'""node1358""'
    # p4
    # sS'JobStatus'
    # p5
    # S'2'
    # p6
    # s."
    self.assert_(re.search(r'BatchJobId[=\s"\'p1S]+%s' % backend_jobid, cache),
                 'Job %s not found in %s blahp cache:\n%s' % (backend_jobid, lrms.upper(), cache))
    os.chdir(cwd)

def test_07_ping_with_gums(self):
    core.state['condor-ce.gums-auth'] = False
    self.general_requirements()
    core.skip_ok_unless_installed('gums-service')

    # Set up GUMS auth using the instructions here:
    # https://opensciencegrid.github.io/docs/compute-element/install-htcondor-ce/#authentication-with-gums
    hostname = core.get_hostname()

    lcmaps_contents = '''gumsclient = "lcmaps_gums_client.mod"
             "-resourcetype ce"
             "-actiontype execute-now"
             "-capath /etc/grid-security/certificates"
             "-cert /etc/grid-security/hostcert.pem"
             "-key /etc/grid-security/hostkey.pem"
             "--cert-owner root"
# Change this URL to your GUMS server
             "--endpoint https://%s:8443/gums/services/GUMSXACMLAuthorizationServicePort"

verifyproxy = "lcmaps_verify_proxy.mod"
          "--allow-limited-proxy"
          " -certdir /etc/grid-security/certificates"

# lcmaps policies require at least two modules, so these are here to
# fill in if only one module is needed. "good | bad" has no effect.
good = "lcmaps_dummy_good.mod"
bad = "lcmaps_dummy_bad.mod"

authorize_only:
## Policy 1: GUMS but not SAZ (most common, default)
gumsclient -> good | bad
''' % hostname

    gums_properties_contents = '''gums.location=https://%s:8443/gums/services/GUMSAdmin
gums.authz=https://%s:8443/gums/services/GUMSXACMLAuthorizationServicePort
''' % (hostname, hostname)

    core.config['condor-ce.lcmapsdb'] = '/etc/lcmaps.db'
    core.config['condor-ce.gums-properties'] = '/etc/gums/gums-client.properties'
    core.config['condor-ce.gsi-authz'] = '/etc/grid-security/gsi-authz.conf'

    files.write(core.config['condor-ce.lcmapsdb'], lcmaps_contents, owner='condor-ce.gums')
    files.write(core.config['condor-ce.gums-properties'], gums_properties_contents, owner='condor-ce')
    files.replace(core.config['condor-ce.gsi-authz'],
                  '# globus_mapping liblcas_lcmaps_gt4_mapping.so lcmaps_callout',
                  'globus_mapping liblcas_lcmaps_gt4_mapping.so lcmaps_callout',
                  owner='condor-ce')
    try:
        core.state['condor-ce.gums-auth'] = True
        service.check_stop('condor-ce')

        stat = core.get_stat(core.config['condor-ce.collectorlog'])
        service.check_start('condor-ce')
        # Wait for the schedd to come back up
        self.failUnless(condor.wait_for_daemon(core.config['condor-ce.collectorlog'], stat, 'Schedd', 300.0),
                        'Schedd failed to restart within the 5 min window')

        command = ('condor_ce_ping', 'WRITE', '-verbose')
        stdout, _, _ = core.check_system(command, 'ping using GSI and gridmap', user=True)
        self.assert_(re.search(r'Authorized:\s*TRUE', stdout), 'could not authorize with GSI')
    finally:
        files.restore(core.config['condor-ce.lcmapsdb'], 'condor-ce.gums')
        files.restore(core.config['condor-ce.gsi-authz'], 'condor-ce')
        files.restore(core.config['condor-ce.gums-properties'], 'condor-ce')

def test_04_start_pbs(self):
    if core.el_release() <= 6:
        core.config['torque.pbs-lockfile'] = '/var/lock/subsys/pbs_server'
    else:
        core.config['torque.pbs-lockfile'] = '/var/lib/torque/server_priv/server.lock'
    core.state['trqauthd.started-service'] = False
    core.state['torque.pbs-server-running'] = False
    core.state['torque.pbs-server-started'] = False
    core.state['torque.pbs-configured'] = False
    core.state['torque.nodes-up'] = False
    core.config['torque.pbs-nodes-file'] = '/var/lib/torque/server_priv/nodes'
    core.config['torque.pbs-servername-file'] = '/var/lib/torque/server_name'

    core.skip_ok_unless_installed(*self.required_rpms)
    if os.path.exists(core.config['torque.pbs-lockfile']):
        core.state['torque.pbs-server-running'] = True
        self.skip_ok('pbs server apparently running')

    # Set the hostname as the server name instead of localhost
    files.write(core.config['torque.pbs-servername-file'],
                "%s" % core.get_hostname(),
                owner='pbs')
    core.state['torque.pbs-configured'] = True

    # trqauthd is required for the pbs_server
    service.start('trqauthd')

    if not os.path.exists('/var/lib/torque/server_priv/serverdb'):
        if core.el_release() <= 6:
            # This creates the default config and starts the service
            command = 'service pbs_server create'
        else:
            # XXX: "service pbs_server create" doesn't work for systemd, and I haven't found a
            # systemd equivalent to do the "create" step in el7 ... The following was
            # distilled from the el6 init.d script: (but please correct as necessary)
            command = ('/usr/sbin/pbs_server -d /var/lib/torque -t create -f && '
                       'sleep 10 && /usr/bin/qterm')
        stdout, _, fail = core.check_system(command, 'create initial pbs serverdb config', shell=True)
        self.assert_(stdout.find('error') == -1, fail)

    # This gets wiped if we write it before the initial 'service pbs_server create'.
    # However, this file needs to be in place before the service is started, so we
    # restart the service after 'initial configuration'.
    # Add the local node as a compute node.
    files.write(core.config['torque.pbs-nodes-file'],
                "%s np=1 num_node_boards=1\n" % core.get_hostname(),
                owner='pbs')

    # Sometimes the restart command throws an error on stop but still manages
    # to kill the service, meaning that the service doesn't get brought back up
    command = ('service', 'pbs_server', 'stop')
    core.system(command, 'stop pbs server daemon')

    server_log = '/var/log/torque/server_logs/' + date.today().strftime('%Y%m%d')
    try:
        server_log_stat = os.stat(server_log)
    except OSError:
        server_log_stat = None

    command = ('service', 'pbs_server', 'start')
    stdout, _, fail = core.check_system(command, 'Start pbs server daemon')
    self.assert_(stdout.find('error') == -1, fail)
    self.assert_(os.path.exists(core.config['torque.pbs-lockfile']),
                 'pbs server run lock file missing')
    core.state['torque.pbs-server-started'] = True
    core.state['torque.pbs-server-running'] = True

    # Wait until the server is up before writing the rest of the config
    core.monitor_file(server_log, server_log_stat, '.*Server Ready.*', 60.0)
    core.check_system("echo '%s' | qmgr %s" % (self.pbs_config, core.get_hostname()),
                      "Configuring pbs server",
                      shell=True)

    # Wait up to 10 minutes for the server to recognize the node
    start_time = time.time()
    while (time.time() - start_time) < 600:
        command = ('/usr/bin/qnodes', '-s', core.get_hostname())
        stdout, _, fail = core.check_system(command, 'Get pbs node info')
        self.assert_(stdout.find('error') == -1, fail)
        if 'state = free' in stdout:
            core.state['torque.nodes-up'] = True
            break
    if not core.state['torque.nodes-up']:
        self.fail('PBS nodes not coming up')

def contact_string(self, jobmanager):
    return core.get_hostname() + '/jobmanager-' + jobmanager

import time

import osgtest.library.core as core
import osgtest.library.files as files
import osgtest.library.mysql as mysql
import osgtest.library.osgunittest as osgunittest
import osgtest.library.service as service

CLUSTER_NAME = 'osg_test'
CTLD_LOG = '/var/log/slurm/slurmctld.log'
SLURM_LOG = '/var/log/slurm/slurm.log'
SHORT_HOSTNAME = core.get_hostname().split('.')[0]

SLURMDBD_CONFIG = """AuthType=auth/munge
DbdHost=localhost
SlurmUser=slurm
DebugLevel=debug5
LogFile=/var/log/slurm/slurmdbd.log
StorageType=accounting_storage/mysql
StorageLoc=%(name)s
StorageUser=%(user)s
StoragePass=%(pass)s
"""

SLURM_CONFIG = """AccountingStorageHost=localhost
AccountingStorageLoc=/tmp/slurm_job_accounting.txt
AccountingStorageType=accounting_storage/slurmdbd
AuthType=auth/munge
ClusterName=%(cluster)s
ControlMachine=%(short_hostname)s
JobAcctGatherType=jobacct_gather/linux
"""

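# The %(...)s placeholders in the templates above are filled from a dict before
# the files are written out. A minimal sketch of that step — the config paths
# and the database name, user, and password values are invented for
# illustration, not taken from the suite:
slurmdbd_contents = SLURMDBD_CONFIG % {
    'name': 'slurm_acct_db',   # hypothetical accounting database name
    'user': 'slurm',           # hypothetical database user
    'pass': 'changeme',        # hypothetical password
}
files.write('/etc/slurm/slurmdbd.conf', slurmdbd_contents, owner='slurm', chmod=0o600)

slurm_contents = SLURM_CONFIG % {
    'cluster': CLUSTER_NAME,
    'short_hostname': SHORT_HOSTNAME,
}
files.write('/etc/slurm/slurm.conf', slurm_contents, owner='slurm', chmod=0o644)
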
import os
import pwd
import shutil
import socket
import tempfile

import osgtest.library.core as core
import osgtest.library.files as files
import osgtest.library.osgunittest as osgunittest
import osgtest.library.service as service
import osgtest.library.xrootd as xrootd

ERR_AUTH_FAIL = 52
ERR_PERMISSION_DENIED = 54
HOSTNAME = core.get_hostname() or "localhost"


def xrootd_record_failure(fn):
    """Decorator for xrootd tests that sets the core.state['xrootd.had-failures']
    flag if there were any test failures.
    """
    def inner(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except (osgunittest.OkSkipException,
                osgunittest.BadSkipException,
                osgunittest.ExcludedException):
            raise
        except AssertionError:
            # Record the failure, then re-raise so the test still fails
            # (the re-raise and the return of the wrapper below complete the
            # truncated original)
            core.state['xrootd.had-failures'] = True
            raise
    return inner

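# A hypothetical application of the decorator above; the test class and test
# method are invented for this sketch:
class TestXrootdCopy(osgunittest.OSGTestCase):

    @xrootd_record_failure
    def test_01_copy(self):
        # Any AssertionError raised here is recorded in
        # core.state['xrootd.had-failures'] and then re-raised.
        self.assertTrue(os.path.exists('/tmp/some_test_file'))
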