def test_remote_set_param(self, collector_ad):
    """A param set through RemoteParam becomes visible after a Reconfig."""
    remote_params = htcondor.RemoteParam(collector_ad)
    assert "OOF" not in remote_params

    # Set the runtime param, then tell the daemon to reconfigure so the
    # change takes effect.
    remote_params["OOF"] = "BAR"
    htcondor.send_command(collector_ad, htcondor.DaemonCommands.Reconfig)

    # A fresh RemoteParam view should now see the value.
    refreshed_params = htcondor.RemoteParam(collector_ad)
    assert "OOF" in refreshed_params
def __init__(self, logger, annex_name, **options):
    """Command every resource in the named HPC annex to shut down fast."""
    if not htcondor.param.get("HPC_ANNEX_ENABLED", False):
        raise ValueError("HPC Annex functionality has not been enabled by your HTCondor administrator.")

    annex_collector = htcondor.param.get("ANNEX_COLLECTOR", "htcondor-cm-hpcannex.osgdev.chtc.io")
    collector = htcondor.Collector(annex_collector)

    # The token is only needed for the lifetime of this process; remove
    # it on exit.
    token_file = create_annex_token(logger, "shutdown")
    atexit.register(lambda: os.unlink(token_file))

    location_ads = collector.query(
        ad_type=htcondor.AdTypes.Master,
        constraint=f'AnnexName =?= "{annex_name}"',
    )
    if not location_ads:
        print(f"No resources found in annex '{annex_name}'.")
        return

    password_file = htcondor.param.get("ANNEX_PASSWORD_FILE", "~/.condor/annex_password_file")
    password_file = os.path.expanduser(password_file)

    # There's a bug here where I should be able to write
    #   with htcondor.SecMan() as security_context:
    # instead, but then security_context is a `lockedContext` object
    # which doesn't have a `setConfig` attribute.
    security_context = htcondor.SecMan()
    with security_context:
        security_context.setConfig("SEC_DEFAULT_AUTHENTICATION_METHODS", "FS IDTOKENS PASSWORD")
        security_context.setConfig("SEC_PASSWORD_FILE", password_file)

        print(f"Shutting down annex '{annex_name}'...")
        # Tell each master to shut itself (and its daemons) down fast.
        for location_ad in location_ads:
            htcondor.send_command(
                location_ad,
                htcondor.DaemonCommands.OffFast,
                "MASTER",
            )

    print(f"... each resource in '{annex_name}' has been commanded to shut down.")
    print("It may take some time for each resource to finish shutting down.")
    print("Annex requests that are still in progress have not been affected.")
def testRemoteSetParam(self):
    """End-to-end RemoteParam get/set/delete against a live collector.

    Launches a collector configured so FOO is remotely settable and
    runtime reconfig is enabled, then checks that a runtime param set
    through one RemoteParam is visible through a fresh one after a
    Reconfig command, and that deleting it removes it again.
    """
    # These env vars seed the collector's config; they must be set
    # before launch_daemons starts it.
    os.environ["_condor_SETTABLE_ATTRS_READ"] = "FOO"
    os.environ["_condor_ENABLE_RUNTIME_CONFIG"] = "TRUE"
    self.launch_daemons(["COLLECTOR"])
    del os.environ["_condor_SETTABLE_ATTRS_READ"]
    #htcondor.param["TOOL_DEBUG"] = "D_NETWORK|D_SECURITY"
    htcondor.enable_debug()

    coll = htcondor.Collector()
    coll_ad = coll.locate(htcondor.DaemonTypes.Collector)

    rparam = htcondor.RemoteParam(coll_ad)
    self.assertTrue("FOO" not in rparam)

    rparam["FOO"] = "BAR"
    # The daemon only picks up runtime params on reconfig.
    htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)

    rparam2 = htcondor.RemoteParam(coll_ad)
    self.assertTrue(rparam2.get("FOO"))
    self.assertTrue("FOO" in rparam2)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual.
    self.assertEqual(rparam2["FOO"], "BAR")

    del rparam["FOO"]
    rparam2.refresh()
    htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
    self.assertTrue("FOO" not in rparam2)
    self.assertTrue(("ENABLE_CHIRP_DELAYED", "true") in rparam2.items())
def testRemoteSetParam(self):
    """End-to-end RemoteParam get/set/delete against a live collector.

    Launches a collector configured so FOO is remotely settable and
    runtime reconfig is enabled, then checks that a runtime param set
    through one RemoteParam is visible through a fresh one after a
    Reconfig command, and that deleting it removes it again.
    """
    # These env vars seed the collector's config; they must be set
    # before launch_daemons starts it.
    os.environ["_condor_SETTABLE_ATTRS_READ"] = "FOO"
    os.environ["_condor_ENABLE_RUNTIME_CONFIG"] = "TRUE"
    self.launch_daemons(["COLLECTOR"])
    del os.environ["_condor_SETTABLE_ATTRS_READ"]
    #htcondor.param["TOOL_DEBUG"] = "D_NETWORK|D_SECURITY"
    htcondor.enable_debug()

    coll = htcondor.Collector()
    coll_ad = coll.locate(htcondor.DaemonTypes.Collector)

    rparam = htcondor.RemoteParam(coll_ad)
    self.assertTrue("FOO" not in rparam)

    rparam["FOO"] = "BAR"
    # The daemon only picks up runtime params on reconfig.
    htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)

    rparam2 = htcondor.RemoteParam(coll_ad)
    self.assertTrue(rparam2.get("FOO"))
    self.assertTrue("FOO" in rparam2)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual.
    self.assertEqual(rparam2["FOO"], "BAR")

    del rparam["FOO"]
    rparam2.refresh()
    htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
    self.assertTrue("FOO" not in rparam2)
    self.assertTrue(("ENABLE_CHIRP_DELAYED", "true") in rparam2.items())
def collector_command_consumer(testrun=False):
    """Consume condor_off commands from a redis list and execute them.

    Pops one JSON-encoded command at a time from the configured redis
    key. For "condor_off", locates the machine's Startd and Master ads
    and sends each a SetPeacefulShutdown command — Startd first, then
    Master, because order matters. Sleeps between polls when the list
    is empty.

    :param testrun: when True, return after a single iteration instead
        of looping forever (True if a command was executed, else False).
    """
    collector_commands_key = config.collector_commands_key
    sleep_interval = config.command_sleep_interval
    while True:
        try:
            redis_con = setup_redis_connection()
            command_string = redis_con.lpop(collector_commands_key)
            if command_string is not None:
                command_dict = json.loads(command_string)
                # Machine names are expected to be ASCII; drop anything
                # else rather than failing.
                # FIX: the old code kept the bytes returned by
                # str.encode(), which made the str-based split("@")
                # below raise TypeError on Python 3 — decode back to
                # str after filtering.
                machine_name = command_dict['machine_name'].encode(
                    'ascii', 'ignore').decode('ascii')
                command = command_dict['command']
                if command == "condor_off":
                    condor_c = htcondor.Collector()
                    logging.info("getting machine ads for %s" % machine_name)
                    startd_ad = condor_c.locate(htcondor.DaemonTypes.Startd,
                                                machine_name)
                    logging.info("found startd.. locating master")
                    # Startd names look like "slot@host"; the Master ad
                    # is registered under the host part only.
                    master_machine_name = machine_name.split("@")[1]
                    master_ad = condor_c.locate(htcondor.DaemonTypes.Master,
                                                master_machine_name)
                    logging.info("Ads found, issuing condor_off commands...")
                    # Order matters here: Startd first, then Master.
                    htcondor.send_command(
                        startd_ad, htcondor.DaemonCommands.SetPeacefulShutdown)
                    htcondor.send_command(
                        master_ad, htcondor.DaemonCommands.SetPeacefulShutdown)
                    if testrun:
                        return True
                else:
                    logging.error("Unrecognized command")
                    if testrun:
                        return False
            else:
                logging.info(
                    "No command in redis list, begining sleep interval...")
                # only sleep if there was no command
                if testrun:
                    return False
                time.sleep(sleep_interval)
        except Exception as e:
            logging.error(
                "Failure connecting to redis or executing condor command...")
            logging.error(e)
            if testrun:
                return False
            time.sleep(sleep_interval)
        except (SystemExit, KeyboardInterrupt):
            # SystemExit/KeyboardInterrupt derive from BaseException, so
            # the Exception handler above does not swallow them.
            return False
def reconfig_collector(self, opt=None):
    """Send a Reconfig command to every locatable collector daemon.

    :param opt: optional extra options (currently unused).
        FIX: the previous default ``opt=[]`` was a shared mutable
        default argument; use None instead — opt is never read, so this
        is behavior-compatible.
    :return: (-1, "", "") without contacting any daemon when
        ``self.test`` is set; otherwise None.
    """
    if opt is None:
        opt = []
    if self.test:
        return (-1, "", "")
    for coll_ad in htcondor.Collector().locateAll(
            htcondor.DaemonTypes.Collector):
        htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
def update_collector_wn_list(self, wn_list):
    """Store a new worker-node list in every collector's runtime config.

    :param wn_list: value written to the WNS runtime parameter of each
        locatable collector daemon.
    """
    for coll_ad in htcondor.Collector().locateAll(
            htcondor.DaemonTypes.Collector):
        # FIX: RemoteParam is dict-like, so the old call
        # `param.update('WNS', wn_list)` raised TypeError (dict.update
        # does not accept two positional arguments); assign the key
        # directly instead.
        param = htcondor.RemoteParam(coll_ad)
        param["WNS"] = wn_list
        # FIX: Reconfig was previously sent *before* the runtime param
        # was set, so the daemon never saw the new value; reconfigure
        # after updating.
        htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
def command_poller():
    """Long-running process: retire and clean up condor machines.

    Each cycle: read the configured groups from the database, then for
    each distinct condor central manager (a) send DaemonsOffPeaceful to
    machines whose retirement has been requested, and (b) invalidate
    the collector classads of machines with no associated VM.

    NOTE(review): the original source was whitespace-mangled; the
    indentation below is a reconstruction — verify the nesting of the
    commit block and second host loop against the original file.
    """
    multiprocessing.current_process().name = "Command Poller"
    condor_host = socket.gethostname()
    # database setup
    config = Config('/etc/cloudscheduler/cloudscheduler.yaml',
                    os.path.basename(sys.argv[0]))
    Resource = config.db_map.classes.condor_machines
    GROUPS = config.db_map.classes.csv2_groups
    try:
        while True:
            logging.info("Beginning command consumer cycle")
            config.db_open()
            db_session = config.db_session
            groups = db_session.query(GROUPS)
            # use a set here so we dont re-query same host if multiple
            # groups have same host
            condor_hosts_set = set()
            for group in groups:
                condor_hosts_set.add(group.condor_central_manager)
            uncommitted_updates = 0
            for condor_host in condor_hosts_set:
                try:
                    condor_session = htcondor.Collector(condor_host)
                except Exception as exc:
                    logging.exception(
                        "Failed to locate condor daemon, skipping...:")
                    logging.error(exc)
                    continue
                master_type = htcondor.AdTypes.Master
                startd_type = htcondor.AdTypes.Startd
                # Query database for machines to be retired.
                abort_cycle = False
                for resource in db_session.query(Resource).filter(
                        Resource.condor_host == condor_host,
                        Resource.retire_request_time > Resource.retired_time):
                    logging.info("Retiring machine %s" % resource.name)
                    try:
                        # Master ad is registered under the host part of
                        # the "slot@host" machine name.
                        condor_classad = condor_session.query(
                            master_type,
                            'Name=="%s"' % resource.name.split("@")[1])[0]
                        master_result = htcondor.send_command(
                            condor_classad,
                            htcondor.DaemonCommands.DaemonsOffPeaceful)
                        resource.retired_time = int(time.time())
                        db_session.merge(resource)
                        uncommitted_updates = uncommitted_updates + 1
                        # Commit in batches to bound transaction size.
                        if uncommitted_updates >= config.batch_commit_size:
                            try:
                                db_session.commit()
                                uncommitted_updates = 0
                            except Exception as exc:
                                logging.exception(
                                    "Failed to commit batch of retired machines, aborting cycle..."
                                )
                                logging.error(exc)
                                abort_cycle = True
                                break
                    except Exception as exc:
                        # NOTE(review): exits the whole process on a
                        # single failed retire; confirm this is the
                        # intended supervisor-restart behavior.
                        logging.exception(
                            "Failed to retire machine, rebooting command poller..."
                        )
                        logging.error(exc)
                        exit(1)
            # Flush any remaining uncommitted retirements.
            if uncommitted_updates > 0:
                try:
                    db_session.commit()
                except Exception as exc:
                    logging.exception(
                        "Failed to commit retire machine, aborting cycle...")
                    logging.error(exc)
                    del condor_session
                    config.db_close()
                    del db_session
                    time.sleep(config.sleep_interval_command)
                    continue
            for condor_host in condor_hosts_set:
                # Query database for machines with no associated VM.
                # NOTE(review): this loop reuses `condor_session` (and
                # `master_type`/`startd_type`) left over from the LAST
                # iteration of the previous loop — it never opens a
                # collector session for `condor_host` here, and those
                # names are unbound if the previous loop body never ran.
                # Looks like a bug; confirm against the original intent.
                master_list = []
                startd_list = []
                redundant_machine_list = db_session.query(
                    view_redundant_machines).filter(
                        view_redundant_machines.c.condor_host == condor_host)
                for resource in redundant_machine_list:
                    logging.info("Removing classads for machine %s" %
                                 resource.name)
                    try:
                        condor_classad = condor_session.query(
                            master_type,
                            'Name=="%s"' % resource.name.split("@")[1])[0]
                        master_list.append(condor_classad)
                        condor_classad = condor_session.query(
                            startd_type, 'Name=="%s"' % resource.name)[0]
                        startd_list.append(condor_classad)
                    except IndexError as exc:
                        # No matching ad in the collector; nothing to
                        # invalidate for this machine.
                        pass
                    except Exception as exc:
                        logging.exception(
                            "Failed to retrieve machine classads, aborting...")
                        logging.error(exc)
                        abort_cycle = True
                        break
                if abort_cycle:
                    abort_cycle = False
                    continue
                # Execute condor_advertise to remove classads.
                if startd_list:
                    startd_advertise_result = condor_session.advertise(
                        startd_list, "INVALIDATE_STARTD_ADS")
                    logging.info("condor_advertise result for startd ads: %s",
                                 startd_advertise_result)
                if master_list:
                    master_advertise_result = condor_session.advertise(
                        master_list, "INVALIDATE_MASTER_ADS")
                    logging.info("condor_advertise result for master ads: %s",
                                 master_advertise_result)
            logging.info("Completed command consumer cycle")
            del condor_session
            config.db_close(commit=True)
            del db_session
            time.sleep(config.sleep_interval_command)
    except Exception as exc:
        logging.exception(
            "Command consumer while loop exception, process terminating...")
        logging.error(exc)
        config.db_close()
        del db_session