def setUp(self):
    """Configure a personal-HTCondor environment for this test, then reset config and security state."""
    self.pid = -1
    cwd = os.getcwd()
    # Daemon binaries are taken from the sibling build directories.
    daemon_paths = {
        "MASTER": "../condor_master.V6/condor_master",
        "COLLECTOR": "../condor_collector.V6/condor_collector",
        "SCHEDD": "../condor_schedd.V6/condor_schedd",
        "PROCD": "../condor_procd/condor_procd",
        "STARTD": "../condor_startd.V6/condor_startd",
        "STARTER": "../condor_starter.V6.1/condor_starter",
        "NEGOTIATOR": "../condor_negotiator.V6/condor_negotiator",
        "SHADOW": "../condor_shadow.V6.1/condor_shadow",
    }
    for daemon, rel_path in daemon_paths.items():
        os.environ["_condor_" + daemon] = os.path.join(cwd, rel_path)
    knobs = {
        "CONDOR_HOST": socket.getfqdn(),
        "LOCAL_DIR": testdir,
        "LOG": '$(LOCAL_DIR)/log',
        "LOCK": '$(LOCAL_DIR)/lock',
        "RUN": '$(LOCAL_DIR)/run',
        "COLLECTOR_NAME": "python_classad_tests",
        "SCHEDD_NAME": "python_classad_tests",
        "MASTER_ADDRESS_FILE": "$(LOG)/.master_address",
        "COLLECTOR_ADDRESS_FILE": "$(LOG)/.collector_address",
        "SCHEDD_ADDRESS_FILE": "$(LOG)/.schedd_address",
        "STARTD_ADDRESS_FILE": "$(LOG)/.startd_address",
        "NEGOTIATOR_ADDRESS_FILE": "$(LOG)/.negotiator_address",
        # Various required attributes for the startd
        "START": "TRUE",
        "SUSPEND": "FALSE",
        "CONTINUE": "TRUE",
        "PREEMPT": "FALSE",
        "KILL": "FALSE",
        "WANT_SUSPEND": "FALSE",
        "WANT_VACATE": "FALSE",
        "MachineMaxVacateTime": "5",
    }
    for knob, value in knobs.items():
        os.environ["_condor_" + knob] = value
    # Pick up the environment-based configuration and drop cached security sessions.
    htcondor.reload_config()
    htcondor.SecMan().invalidateAllSessions()
def __init__(self, submissionHost, *args, **kwargs):
    """Bind this client to a schedd/pool parsed from ``submissionHost`` and set up the condor backend."""
    self.submissionHost = submissionHost
    # Make logger
    tmp_log = core_utils.make_logger(baseLogger,
                                     'submissionHost={0}'.format(self.submissionHost),
                                     method_name='CondorClient.__init__')
    tmp_log.debug('Initializing client')
    self.lock = threading.Lock()
    self.condor_api = CONDOR_API
    self.condor_schedd = None
    self.condor_pool = None
    # Parse condor command remote options from workspec
    if self.submissionHost in ('LOCAL', 'None'):
        tmp_log.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(self.submissionHost))
    else:
        fields = self.submissionHost.split(',')
        if len(fields) >= 2:
            self.condor_schedd, self.condor_pool = fields[0], fields[1]
        else:
            tmp_log.error('Invalid submissionHost: {0} . Skipped'.format(self.submissionHost))
    # Use Python API or fall back to command
    if self.condor_api == 'python':
        try:
            self.secman = htcondor.SecMan()
            self.renew_session(init=True)
        except Exception as e:
            tmp_log.error('Error when using htcondor Python API. Exception {0}: {1}'.format(e.__class__.__name__, e))
            raise
    tmp_log.debug('Initialized client')
def __init__(self, *args, **kwargs):
    """Set up schedd/pool targets and pick the query backend (python API or condor commands)."""
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, method_name='CondorJobQuery.__init__')
    # Initialize
    with self.classLock:
        tmpLog.debug('Start')
        self.submissionHost = str(kwargs.get('id'))
        self.lock = threading.Lock()
        self.condor_api = CONDOR_API
        self.condor_schedd = None
        self.condor_pool = None
        # 'LOCAL'/'None' means a local schedd; note str(None) == 'None' when
        # kwargs lacks 'id', which previously fell through to split() and
        # logged a spurious "Invalid submissionHost" error.  This matches the
        # handling used by the other initializers in this codebase.
        if self.submissionHost in ('LOCAL', 'None'):
            tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(self.submissionHost))
        elif self.submissionHost:
            try:
                # Expect "schedd,pool"; extra fields are ignored.
                self.condor_schedd, self.condor_pool = self.submissionHost.split(',')[0:2]
            except ValueError:
                tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(self.submissionHost))
        # Use the Python bindings if available, otherwise fall back to commands.
        if self.condor_api == 'python':
            try:
                self.secman = htcondor.SecMan()
                self.renew_session()
            except Exception as e:
                self.condor_api = 'command'
                tmpLog.warning('Using condor command instead due to exception from unsupported version of python or condor api: {0}'.format(e))
        tmpLog.debug('Initialize done')
def ping_authz(token, today):
    """Ping the collector's security endpoint with `token` and return the authorization ad as a dict.

    The `today` parameter is accepted for interface compatibility; it is not
    used here.
    """
    collector = current_app.config.get("COLLECTOR", "flock.opensciencegrid.org")
    # We are sufficiently friendly with the CHTC collector that, if we see a token from there,
    # use that collector instead of the OSG one. This allows CHTC glideins to send logs to the
    # OSPool syslog service. Mostly, this allows testing without disturbing the OSPool.
    token_pieces = token.split(".")
    if len(token_pieces) == 3:
        try:
            # JWT payloads are base64url-encoded (RFC 7515) without padding, so
            # decode with the URL-safe alphabet; "-len % 4" pads to a multiple
            # of four and adds nothing when the length is already aligned.
            payload_b64 = token_pieces[1]
            payload = base64.urlsafe_b64decode(payload_b64 + "=" * (-len(payload_b64) % 4))
            payload = json.loads(payload)
            if payload.get("iss") == "cm.chtc.wisc.edu":
                collector = "glidein-cm.chtc.wisc.edu"
        except Exception:
            # Best-effort: an undecodable token just uses the default collector.
            pass
    addrs = socket.getaddrinfo(collector, 9618, socket.AF_INET, socket.SOCK_STREAM)[0][-1]
    myaddr = f"<{addrs[0]}:{addrs[1]}>"
    with htcondor.SecMan() as secman:
        secman.setToken(htcondor.Token(token))
        return dict(secman.ping(myaddr))
def setUp(self):
    """Build a clean personal-HTCondor environment (wiping inherited knobs) for this test."""
    self.pid = -1
    # Drop any _condor_* settings inherited from a previous test or the shell.
    for key in [k for k in os.environ if k.lower().startswith("_condor_")]:
        del os.environ[key]
    cwd = os.getcwd()
    # Daemon binaries come from the sibling build directories.
    daemon_paths = {
        "MASTER": "../condor_master.V6/condor_master",
        "COLLECTOR": "../condor_collector.V6/condor_collector",
        "SCHEDD": "../condor_schedd.V6/condor_schedd",
        "PROCD": "../condor_procd/condor_procd",
        "STARTD": "../condor_startd.V6/condor_startd",
        "STARTER": "../condor_starter.V6.1/condor_starter",
        "NEGOTIATOR": "../condor_negotiator.V6/condor_negotiator",
        "SHADOW": "../condor_shadow.V6.1/condor_shadow",
        "SHARED_PORT": "../condor_shared_port/condor_shared_port",
    }
    for daemon, rel_path in daemon_paths.items():
        os.environ["_condor_" + daemon] = os.path.join(cwd, rel_path)
    knobs = {
        "CONDOR_HOST": socket.getfqdn(),
        "LOCAL_DIR": testdir,
        "LOG": '$(LOCAL_DIR)/log',
        "LOCK": '$(LOCAL_DIR)/lock',
        "RUN": '$(LOCAL_DIR)/run',
        "COLLECTOR_NAME": "python_classad_tests",
        "SCHEDD_NAME": "python_classad_tests",
        "MASTER_ADDRESS_FILE": "$(LOG)/.master_address",
        "COLLECTOR_ADDRESS_FILE": "$(LOG)/.collector_address",
        "SCHEDD_ADDRESS_FILE": "$(LOG)/.schedd_address",
        "STARTD_ADDRESS_FILE": "$(LOG)/.startd_address",
        "STARTD_DEBUG": "D_FULLDEBUG",
        "STARTER_DEBUG": "D_FULLDEBUG",
        "SHADOW_DEBUG": "D_FULLDEBUG|D_MACHINE",
        "NEGOTIATOR_ADDRESS_FILE": "$(LOG)/.negotiator_address",
        # Aggressive timers so tests are not stalled by default daemon cadences.
        "NEGOTIATOR_CYCLE_DELAY": "1",
        "NEGOTIATOR_INTERVAL": "1",
        "SCHEDD_INTERVAL": "1",
        "SCHEDD_MIN_INTERVAL": "1",
        "CONDOR_FSYNC": "FALSE",
        # Various required attributes for the startd
        "START": "TRUE",
        "SUSPEND": "FALSE",
        "CONTINUE": "TRUE",
        "PREEMPT": "FALSE",
        "KILL": "FALSE",
        "WANT_SUSPEND": "FALSE",
        "WANT_VACATE": "FALSE",
        "MachineMaxVacateTime": "5",
        "JOB_INHERITS_STARTER_ENVIRONMENT": "TRUE",
    }
    for knob, value in knobs.items():
        os.environ["_condor_" + knob] = value
    # Re-read configuration from the environment and drop cached security sessions.
    htcondor.reload_config()
    htcondor.SecMan().invalidateAllSessions()
def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
    """ Submit directly to the schedd using the HTCondor module """
    dagAd = classad.ClassAd()
    addCRABInfoToClassAd(dagAd, info)

    # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
    dagAd["CRAB_Attempt"] = 0
    dagAd["JobUniverse"] = 12
    dagAd["HoldKillSig"] = "SIGUSR1"
    dagAd["Out"] = os.path.join(info['scratch'], "request.out")
    dagAd["Err"] = os.path.join(info['scratch'], "request.err")
    dagAd["Cmd"] = cmd
    dagAd['Args'] = arg
    dagAd["TransferInput"] = info['inputFilesString']
    dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
    dagAd["TransferOutput"] = info['outputFilesString']
    dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
    dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
    dagAd["RemoveKillSig"] = "SIGUSR1"
    dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CONDOR_ID=", ClusterId, ".", ProcId)')
    dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
    dagAd["Requirements"] = classad.ExprTree('true || false')
    dagAd["TaskType"] = "ROOT"
    dagAd["X509UserProxy"] = info['userproxy']

    # Submit in a forked child so the X509_USER_PROXY / security-session changes
    # cannot leak into the parent.  The child reports "OK" or a traceback over a pipe.
    r, w = os.pipe()
    rpipe = os.fdopen(r, 'r')
    wpipe = os.fdopen(w, 'w')
    if os.fork() == 0:
        #pylint: disable=W0212
        try:
            rpipe.close()
            try:
                resultAds = []
                htcondor.SecMan().invalidateAllSessions()
                os.environ['X509_USER_PROXY'] = info['userproxy']
                schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                wpipe.write("OK")
                wpipe.close()
                os._exit(0)
            except Exception: #pylint: disable=W0703
                # os._exit() skips interpreter cleanup, so the buffered traceback
                # must be flushed/closed here or the parent would read an empty
                # error message and raise with no detail.
                wpipe.write(str(traceback.format_exc()))
                wpipe.close()
        finally:
            os._exit(1)
    wpipe.close()
    results = rpipe.read()
    if results != "OK":
        raise Exception("Failure when submitting HTCondor task: %s" % results)
    schedd.reschedule()
def __enter__(self):
    # Fork a helper process that will run with the user's proxy credentials;
    # the parent keeps the read end of a pipe to collect the child's output.
    self.r, self.w = os.pipe()
    self.rpipe = os.fdopen(self.r, 'r')
    self.wpipe = os.fdopen(self.w, 'w')
    self.pid = os.fork()
    if self.pid == 0:
        # Child: drop cached security sessions so the FS/GSI settings below
        # take effect for subsequent condor operations, then point
        # X509_USER_PROXY at the user's proxy file (self.proxy is set
        # elsewhere on this object — not visible in this block).
        htcondor.SecMan().invalidateAllSessions()
        htcondor.param['SEC_CLIENT_AUTHENTICATION_METHODS'] = 'FS,GSI'
        htcondor.param['DELEGATE_FULL_JOB_GSI_CREDENTIALS'] = 'true'
        htcondor.param['DELEGATE_JOB_GSI_CREDENTIALS_LIFETIME'] = '0'
        os.environ['X509_USER_PROXY'] = self.proxy
        self.rpipe.close()
    else:
        # Parent: keep only the read end.
        self.wpipe.close()
    # Returned in BOTH processes: pid == 0 identifies the child.  The caller is
    # expected to branch on pid (child writes via self.wpipe, parent reads rpipe).
    return self.pid, self.rpipe
def __init__(self, cacheEnable=False, cacheRefreshInterval=None, useCondorHistory=True, *args, **kwargs):
    """Prepare a job-query object for one schedd, optionally with a cached query result."""
    self.submissionHost = str(kwargs.get('id'))
    # Make logger
    tmp_log = core_utils.make_logger(baseLogger,
                                     'submissionHost={0}'.format(self.submissionHost),
                                     method_name='CondorJobQuery.__init__')
    with self.classLock:
        tmp_log.debug('Start')
        self.lock = threading.Lock()
        self.condor_api = CONDOR_API
        self.condor_schedd = None
        self.condor_pool = None
        # Cache stays disabled until the backend is set up below.
        self.cacheEnable = False
        # 'LOCAL'/'None' means local schedd; otherwise expect "schedd,pool".
        if self.submissionHost in ('LOCAL', 'None'):
            tmp_log.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(self.submissionHost))
        else:
            fields = self.submissionHost.split(',')
            if len(fields) >= 2:
                self.condor_schedd, self.condor_pool = fields[0], fields[1]
            else:
                tmp_log.error('Invalid submissionHost: {0} . Skipped'.format(self.submissionHost))
        # Prefer the Python bindings; drop to condor commands on any failure.
        if self.condor_api == 'python':
            try:
                self.secman = htcondor.SecMan()
                self.renew_session()
            except Exception as e:
                self.condor_api = 'command'
                tmp_log.warning('Using condor command instead due to exception from unsupported version of python or condor api: {0}'.format(e))
        self.cacheEnable = cacheEnable
        if self.cacheEnable:
            self.cache = ([], 0)
            self.cacheRefreshInterval = cacheRefreshInterval
        self.useCondorHistory = useCondorHistory
        tmp_log.debug('Initialize done')
def setUp(self):
    """Configure a personal HTCondor with the Lark network-namespace plugin for this test."""
    self.pid = -1
    cwd = os.getcwd()
    # Daemon binaries come from the sibling build directories.
    daemon_paths = {
        "MASTER": "../condor_master.V6/condor_master",
        "COLLECTOR": "../condor_collector.V6/condor_collector",
        "SCHEDD": "../condor_schedd.V6/condor_schedd",
        "PROCD": "../condor_procd/condor_procd",
        "STARTD": "../condor_startd.V6/condor_startd",
        "STARTER": "../condor_starter.V6.1/condor_starter",
        "NEGOTIATOR": "../condor_negotiator.V6/condor_negotiator",
        "SHADOW": "../condor_shadow.V6.1/condor_shadow",
    }
    for daemon, rel_path in daemon_paths.items():
        os.environ["_condor_" + daemon] = os.path.join(cwd, rel_path)
    # The Lark starter plugin provides per-job network namespaces.
    os.environ["_condor_STARTER.PLUGINS"] = os.path.join(cwd, "../condor_contrib/lark/lark-plugin.so")
    # Disabled Lark knobs kept for reference:
    #   LARK_NETWORK_ACCOUNTING = TRUE
    #   STARTD_ATTRS = LarkNetworkType, LarkAddressType, LarkBridgeDevice
    #   LarkNetworkType = bridge ; LarkNetBridgeDevice = eth0 ; LarkAddressType = dhcp
    knobs = {
        "USE_NETWORK_NAMESPACES": "TRUE",
        "CONDOR_HOST": socket.getfqdn(),
        "LOCAL_DIR": testdir,
        "LOG": '$(LOCAL_DIR)/log',
        "LOCK": '$(LOCAL_DIR)/lock',
        "RUN": '$(LOCAL_DIR)/run',
        "COLLECTOR_NAME": "python_classad_tests",
        "SCHEDD_NAME": "python_classad_tests",
        "MASTER_ADDRESS_FILE": "$(LOG)/.master_address",
        "COLLECTOR_ADDRESS_FILE": "$(LOG)/.collector_address",
        "SCHEDD_ADDRESS_FILE": "$(LOG)/.schedd_address",
        "STARTD_ADDRESS_FILE": "$(LOG)/.startd_address",
        "NEGOTIATOR_ADDRESS_FILE": "$(LOG)/.negotiator_address",
        # Various required attributes for the startd
        "START": "TRUE",
        "SUSPEND": "FALSE",
        "CONTINUE": "TRUE",
        "PREEMPT": "FALSE",
        "KILL": "FALSE",
        "WANT_SUSPEND": "FALSE",
        "WANT_VACATE": "FALSE",
        # Remember to check the correctness of the network policy script path defined here.
        "STARTER_NETWORK_POLICY_SCRIPT_PATH": os.path.join(cwd, "../condor_contrib/lark/LarkNetworkPolicy/lark_network_policy.py"),
        "STARTER_DEBUG": "D_FULLDEBUG",
    }
    for knob, value in knobs.items():
        os.environ["_condor_" + knob] = value
    # Pick up the environment-based configuration and drop cached security sessions.
    htcondor.reload_config()
    htcondor.SecMan().invalidateAllSessions()
def test_ping(self, collector_ad):
    """Security pings (by ad and by address, with and without an explicit level) must authorize."""
    assert "MyAddress" in collector_ad
    secman = htcondor.SecMan()
    # (ping target, authorization level or None for the default, expected command int)
    cases = [
        (collector_ad, "WRITE", 60021),
        (collector_ad["MyAddress"], "WRITE", 60021),
        (collector_ad["MyAddress"], None, 60011),
    ]
    for target, level, expected_command in cases:
        if level is None:
            authz_ad = secman.ping(target)
        else:
            authz_ad = secman.ping(target, level)
        assert "AuthCommand" in authz_ad
        assert authz_ad['AuthCommand'] == expected_command
        assert "AuthorizationSucceeded" in authz_ad
        assert authz_ad['AuthorizationSucceeded']
def __init__(self, logger, annex_name, **options):
    """Command every master in the named HPC annex to shut down fast."""
    if not htcondor.param.get("HPC_ANNEX_ENABLED", False):
        raise ValueError("HPC Annex functionality has not been enabled by your HTCondor administrator.")

    annex_collector = htcondor.param.get("ANNEX_COLLECTOR", "htcondor-cm-hpcannex.osgdev.chtc.io")
    collector = htcondor.Collector(annex_collector)

    # The token authorizes the shutdown command; delete it when this process exits.
    token_file = create_annex_token(logger, "shutdown")
    atexit.register(lambda: os.unlink(token_file))

    location_ads = collector.query(
        ad_type=htcondor.AdTypes.Master,
        constraint=f'AnnexName =?= "{annex_name}"',
    )
    if not location_ads:
        print(f"No resources found in annex '{annex_name}'.")
        return

    password_file = os.path.expanduser(
        htcondor.param.get("ANNEX_PASSWORD_FILE", "~/.condor/annex_password_file")
    )

    # NOTE(review): `with htcondor.SecMan() as security_context:` yields a
    # `lockedContext` object that lacks `setConfig`, so the SecMan object is
    # created first and only the context-manager protocol is used on it.
    security_context = htcondor.SecMan()
    with security_context:
        security_context.setConfig("SEC_DEFAULT_AUTHENTICATION_METHODS", "FS IDTOKENS PASSWORD")
        security_context.setConfig("SEC_PASSWORD_FILE", password_file)

        print(f"Shutting down annex '{annex_name}'...")
        for location_ad in location_ads:
            htcondor.send_command(
                location_ad,
                htcondor.DaemonCommands.OffFast,
                "MASTER",
            )
        print(f"... each resource in '{annex_name}' has been commanded to shut down.")
        print("It may take some time for each resource to finish shutting down.")
        print("Annex requests that are still in progress have not been affected.")
def testPing(self):
    """Security pings against a freshly launched collector must authorize at WRITE and default levels."""
    self.launch_daemons(["COLLECTOR"])
    coll = htcondor.Collector()
    coll_ad = coll.locate(htcondor.DaemonTypes.Collector)
    self.assertTrue("MyAddress" in coll_ad)
    secman = htcondor.SecMan()
    # Ping by ClassAd at WRITE level.
    # (assertEquals is a deprecated alias removed in Python 3.12 — use assertEqual.)
    authz_ad = secman.ping(coll_ad, "WRITE")
    self.assertTrue("AuthCommand" in authz_ad)
    self.assertEqual(authz_ad['AuthCommand'], 60021)
    self.assertTrue("AuthorizationSucceeded" in authz_ad)
    self.assertTrue(authz_ad['AuthorizationSucceeded'])
    # Ping by sinful address string at WRITE level.
    authz_ad = secman.ping(coll_ad["MyAddress"], "WRITE")
    self.assertTrue("AuthCommand" in authz_ad)
    self.assertEqual(authz_ad['AuthCommand'], 60021)
    self.assertTrue("AuthorizationSucceeded" in authz_ad)
    self.assertTrue(authz_ad['AuthorizationSucceeded'])
    # Ping with the default authorization level.
    authz_ad = secman.ping(coll_ad["MyAddress"])
    self.assertTrue("AuthCommand" in authz_ad)
    self.assertEqual(authz_ad['AuthCommand'], 60021)
    self.assertTrue("AuthorizationSucceeded" in authz_ad)
    self.assertTrue(authz_ad['AuthorizationSucceeded'])
def main(): opts = parse_opts() if not opts.cepool: coll = htcondor.Collector() else: coll = htcondor.Collector(opts.cepool) if not opts.name: schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd) else: schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd, opts.name) sitename = htcondor.param.get('OSG_ResourceGroup', htcondor.param.get('HTCONDORCE_SiteName')) if not sitename: print >> sys.stderr, "Neither OSG_ResourceGroup nor HTCONDORCE_SiteName set in config file." sys.exit(1) batch = htcondor.param.get('OSG_BatchSystems', htcondor.param.get('HTCONDORCE_BatchSystem', 'Condor')).split(",")[0] status = htcondor.param.get('HTCONDORCE_Status', 'Production') hepspec_info = htcondor.param.get('HTCONDORCE_HEPSPEC_INFO').split('-')[0] if not hepspec_info: print >> sys.stderr, "HTCONDORCE_HEPSPEC_INFO not provided." sys.exit(1) cores = htcondor.param.get('HTCONDORCE_CORES') if not cores: print >> sys.stderr, "HTCONDORCE_CORES not available." sys.exit(1) schedd_name = schedd_ad['Name'] schedd = htcondor.Schedd(schedd_ad) query = schedd.xquery("x509userproxyvoname isnt undefined", ["JobStatus", "x509userproxyvoname"]) idle_vo_jobs = collections.defaultdict(int) running_vo_jobs = collections.defaultdict(int) total_vo_jobs = collections.defaultdict(int) for job in query: if not job.get("JobStatus") or not job.get("x509userproxyvoname"): continue total_vo_jobs[job['x509userproxyvoname']] += 1 if job['JobStatus'] == 1: idle_vo_jobs[job['x509userproxyvoname']] += 1 elif job['JobStatus'] == 2: running_vo_jobs[job['x509userproxyvoname']] += 1 idle_cores = 0 busy_cores = 0 total_cores = 0 leader_election = htcondor.param.get('HTCONDORCE_BDII_ELECTION') if leader_election == 'ZOOKEEPER': zkhosts = htcondor.param.get('HTCONDORCE_BDII_ZKHOSTS') leader = TimeStampLeader(zkhosts, "/htcondor/bdii_update") else: leader = PublicationLeader(htcondor.param.get('HTCONDORCE_BDII_LEADER')) leader.should_publish() poolcoll = htcondor.Collector(opts.pool) hosts = 
poolcoll.query(htcondor.AdTypes.Collector, True, ['HostsUnclaimed'])[0] total_instances = hosts.get('HostsUnclaimed', 0) if not leader.leader(): total_instances = 0 for ad in poolcoll.query(htcondor.AdTypes.Startd, 'State=!="Owner"', ["State", "Cpus"]): if not ad.get('State') or not ad.get('Cpus') or not leader.leader(): continue total_cores += ad['Cpus'] if ad['State'] == 'Unclaimed': idle_cores += ad['Cpus'] elif ad['State'] == 'Claimed': busy_cores += ad['Cpus'] vonames = set() vonames.update(idle_vo_jobs.keys()) vonames.update(running_vo_jobs.keys()) vonames.update([i.strip() for i in re.split('\s*,?\s*', htcondor.param.get('HTCONDORCE_VONAMES', '')) if i]) try: if htcondor.SecMan().ping(schedd_ad, "READ")['AuthorizationSucceeded']: cestatus = 'OK' else: cestatus = 'CRITICAL' cestatusinfo = 'Authorization ping successful' except Exception, e: cestatus = 'UNKNOWN' cestatusinfo = 'Authorization ping failed: %s' % str(e)