def test_remote_set_param(self, collector_ad):
     """Check that a value written through ``RemoteParam`` is visible after a reconfig.

     ``collector_ad`` is handed both to ``htcondor.RemoteParam`` and to
     ``htcondor.send_command``; it is presumably the collector's location ad
     supplied by a fixture -- confirm against the test setup.
     """
     writer = htcondor.RemoteParam(collector_ad)
     # The knob must not exist before this test sets it.
     assert not ("OOF" in writer)

     writer["OOF"] = "BAR"
     htcondor.send_command(collector_ad, htcondor.DaemonCommands.Reconfig)

     # Use a second, fresh handle so the membership check does not rely on
     # any state held by the handle that performed the write.
     reader = htcondor.RemoteParam(collector_ad)
     assert "OOF" in reader
示例#2
0
    def __init__(self, logger, annex_name, **options):
        """Command every master advertised for ``annex_name`` to shut down fast.

        Args:
            logger: Passed through to ``create_annex_token``.
            annex_name: Annex whose master ads are looked up (matched against
                the ``AnnexName`` attribute) and sent ``OffFast``.
            **options: Accepted but not used in this method.

        Raises:
            ValueError: If the ``HPC_ANNEX_ENABLED`` configuration value is
                missing or falsy.
        """
        annex_enabled = htcondor.param.get("HPC_ANNEX_ENABLED", False)
        if not annex_enabled:
            raise ValueError("HPC Annex functionality has not been enabled by your HTCondor administrator.")

        collector_address = htcondor.param.get("ANNEX_COLLECTOR", "htcondor-cm-hpcannex.osgdev.chtc.io")
        annex_pool = htcondor.Collector(collector_address)

        # The token file is only needed for the lifetime of this process;
        # remove it again when the interpreter exits.
        token_file = create_annex_token(logger, "shutdown")
        atexit.register(os.unlink, token_file)

        master_ads = annex_pool.query(
            ad_type=htcondor.AdTypes.Master,
            constraint=f'AnnexName =?= "{annex_name}"',
        )
        if not master_ads:
            print(f"No resources found in annex '{annex_name}'.")
            return

        password_file = os.path.expanduser(
            htcondor.param.get("ANNEX_PASSWORD_FILE", "~/.condor/annex_password_file")
        )

        # Known quirk: `with htcondor.SecMan() as security_context:` would be
        # the natural spelling, but the object bound by `as` is then a
        # `lockedContext`, which has no `setConfig` attribute.  Keep a handle
        # on the SecMan itself and enter it separately.
        security_context = htcondor.SecMan()
        with security_context:
            security_context.setConfig("SEC_DEFAULT_AUTHENTICATION_METHODS", "FS IDTOKENS PASSWORD")
            security_context.setConfig("SEC_PASSWORD_FILE", password_file)

            print(f"Shutting down annex '{annex_name}'...")
            for master_ad in master_ads:
                htcondor.send_command(master_ad, htcondor.DaemonCommands.OffFast, "MASTER")

        print(f"... each resource in '{annex_name}' has been commanded to shut down.")
        print("It may take some time for each resource to finish shutting down.")
        print("Annex requests that are still in progress have not been affected.")
示例#3
0
 def testRemoteSetParam(self):
     """Set, read back and delete a runtime parameter on a live collector.

     Launches a collector with ``FOO`` settable and runtime configuration
     enabled (via ``_condor_*`` environment variables), then checks that a
     value written through ``htcondor.RemoteParam`` is visible after a
     reconfig and gone again after it is deleted.
     """
     os.environ["_condor_SETTABLE_ATTRS_READ"] = "FOO"
     os.environ["_condor_ENABLE_RUNTIME_CONFIG"] = "TRUE"
     self.launch_daemons(["COLLECTOR"])
     # Only the daemon needed this setting; drop it from this process's
     # environment again once the collector has been launched.
     del os.environ["_condor_SETTABLE_ATTRS_READ"]
     # Uncomment for verbose network/security tracing while debugging:
     #htcondor.param["TOOL_DEBUG"] = "D_NETWORK|D_SECURITY"
     htcondor.enable_debug()
     coll = htcondor.Collector()
     coll_ad = coll.locate(htcondor.DaemonTypes.Collector)
     rparam = htcondor.RemoteParam(coll_ad)
     self.assertNotIn("FOO", rparam)
     rparam["FOO"] = "BAR"
     htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
     rparam2 = htcondor.RemoteParam(coll_ad)
     self.assertTrue(rparam2.get("FOO"))
     self.assertIn("FOO", rparam2)
     # assertEquals was a deprecated alias and no longer exists on
     # Python 3.12+; assertEqual is the supported spelling.
     self.assertEqual(rparam2["FOO"], "BAR")
     del rparam["FOO"]
     rparam2.refresh()
     htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
     self.assertNotIn("FOO", rparam2)
     self.assertIn(("ENABLE_CHIRP_DELAYED", "true"), rparam2.items())
示例#4
0
 def testRemoteSetParam(self):
     """Set, read back and delete a runtime parameter on a live collector.

     Launches a collector with ``FOO`` settable and runtime configuration
     enabled (via ``_condor_*`` environment variables), then checks that a
     value written through ``htcondor.RemoteParam`` is visible after a
     reconfig and gone again after it is deleted.
     """
     os.environ["_condor_SETTABLE_ATTRS_READ"] = "FOO"
     os.environ["_condor_ENABLE_RUNTIME_CONFIG"] = "TRUE"
     self.launch_daemons(["COLLECTOR"])
     # Only the daemon needed this setting; drop it from this process's
     # environment again once the collector has been launched.
     del os.environ["_condor_SETTABLE_ATTRS_READ"]
     # Uncomment for verbose network/security tracing while debugging:
     #htcondor.param["TOOL_DEBUG"] = "D_NETWORK|D_SECURITY"
     htcondor.enable_debug()
     coll = htcondor.Collector()
     coll_ad = coll.locate(htcondor.DaemonTypes.Collector)
     rparam = htcondor.RemoteParam(coll_ad)
     self.assertNotIn("FOO", rparam)
     rparam["FOO"] = "BAR"
     htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
     rparam2 = htcondor.RemoteParam(coll_ad)
     self.assertTrue(rparam2.get("FOO"))
     self.assertIn("FOO", rparam2)
     # assertEquals was a deprecated alias and no longer exists on
     # Python 3.12+; assertEqual is the supported spelling.
     self.assertEqual(rparam2["FOO"], "BAR")
     del rparam["FOO"]
     rparam2.refresh()
     htcondor.send_command(coll_ad, htcondor.DaemonCommands.Reconfig)
     self.assertNotIn("FOO", rparam2)
     self.assertIn(("ENABLE_CHIRP_DELAYED", "true"), rparam2.items())
示例#5
0
def collector_command_consumer(testrun=False):
    """Pop collector commands from a redis list and act on them, forever.

    Each pass pops one JSON document from the list named by
    ``config.collector_commands_key``.  The document must carry a
    ``machine_name`` and a ``command`` key; the only recognised command is
    ``"condor_off"``, for which the startd and master ads of the machine are
    located and each is sent ``SetPeacefulShutdown`` (startd first).

    Args:
        testrun: When true, return after the first pass instead of looping.

    Returns:
        Only returns when ``testrun`` is true or on SystemExit /
        KeyboardInterrupt: ``True`` if a ``condor_off`` command was
        dispatched during a test run, ``False`` otherwise.
    """
    collector_commands_key = config.collector_commands_key
    sleep_interval = config.command_sleep_interval

    while True:
        try:
            redis_con = setup_redis_connection()
            command_string = redis_con.lpop(collector_commands_key)

            if command_string is None:
                logging.info(
                    "No command in redis list, begining sleep interval...")
                # only sleep if there was no command
                if testrun:
                    return False
                time.sleep(sleep_interval)
                continue

            command_dict = json.loads(command_string)

            # Strip non-ASCII characters from the machine identifier, then go
            # back to a native string.  Leaving the encoded value as-is gives
            # `bytes` on Python 3, which breaks the split("@") below and
            # formats as "b'...'" in the log line.
            machine_name = str(
                command_dict['machine_name']
                .encode('ascii', 'ignore')
                .decode('ascii'))
            command = command_dict['command']

            if command != "condor_off":
                logging.error("Unrecognized command")
                if testrun:
                    return False
                continue

            # Order matters: the startd is commanded before the master.
            # NOTE(review): the original design notes mention issuing
            # DaemonsOffPeaceful (condor_off -peaceful); only
            # SetPeacefulShutdown is sent here -- confirm that is intended.
            condor_c = htcondor.Collector()
            logging.info("getting machine ads for %s", machine_name)
            startd_ad = condor_c.locate(htcondor.DaemonTypes.Startd,
                                        machine_name)
            logging.info("found startd.. locating master")
            # The master is looked up by the part of the name after "@".
            master_machine_name = machine_name.split("@")[1]
            master_ad = condor_c.locate(htcondor.DaemonTypes.Master,
                                        master_machine_name)

            logging.info("Ads found, issuing condor_off commands...")
            htcondor.send_command(
                startd_ad, htcondor.DaemonCommands.SetPeacefulShutdown)
            htcondor.send_command(
                master_ad, htcondor.DaemonCommands.SetPeacefulShutdown)
            if testrun:
                return True

        except Exception as e:
            logging.error(
                "Failure connecting to redis or executing condor command...")
            logging.error(e)
            if testrun:
                return False
            time.sleep(sleep_interval)

        except (SystemExit, KeyboardInterrupt):
            # Not subclasses of Exception, so they land here and end the loop.
            return False
示例#6
0
文件: condor.py 项目: rcmdnk/gcpm
 def reconfig_collector(self, opt=[]):
     """Send a Reconfig command to every collector that can be located.

     When ``self.test`` is truthy nothing is sent and ``(-1, "", "")`` is
     returned instead.  ``opt`` is accepted but not used.
     """
     if self.test:
         return (-1, "", "")
     collector_ads = htcondor.Collector().locateAll(
         htcondor.DaemonTypes.Collector)
     for collector_ad in collector_ads:
         htcondor.send_command(collector_ad,
                               htcondor.DaemonCommands.Reconfig)
示例#7
0
文件: condor.py 项目: rcmdnk/gcpm
 def update_collector_wn_list(self, wn_list):
     """Reconfigure each located collector, then push ``wn_list`` as ``WNS``.

     NOTE(review): the Reconfig command is sent *before* the remote
     parameter is written -- confirm this ordering is intended.
     NOTE(review): ``RemoteParam.update`` is called with two positional
     arguments (name, value); verify this matches the bindings' signature.
     """
     collector_ads = htcondor.Collector().locateAll(
         htcondor.DaemonTypes.Collector)
     for collector_ad in collector_ads:
         htcondor.send_command(collector_ad,
                               htcondor.DaemonCommands.Reconfig)
         remote_config = htcondor.RemoteParam(collector_ad)
         remote_config.update('WNS', wn_list)
示例#8
0
def command_poller():
    """Retire flagged machines and invalidate classads of redundant ones.

    Runs as a long-lived worker.  Every cycle it:

    1. Opens the database and collects the distinct
       ``condor_central_manager`` values of all groups.
    2. For each such host, sends ``DaemonsOffPeaceful`` to the master of
       every machine whose ``retire_request_time`` is later than its
       ``retired_time``, stamping ``retired_time`` and committing in batches
       of ``config.batch_commit_size``.
    3. For each such host, looks up the master and startd ads of the rows in
       ``view_redundant_machines`` and invalidates them via
       ``Collector.advertise``.
    4. Closes the database and sleeps ``config.sleep_interval_command``.

    A failure while retiring a machine exits the process with status 1; any
    other unexpected exception is logged and ends the function.
    """
    multiprocessing.current_process().name = "Command Poller"
    # database setup
    config = Config('/etc/cloudscheduler/cloudscheduler.yaml',
                    os.path.basename(sys.argv[0]))

    Resource = config.db_map.classes.condor_machines
    GROUPS = config.db_map.classes.csv2_groups

    # Loop-invariant; bound up front so they exist even when no collector
    # could be contacted in a cycle.
    master_type = htcondor.AdTypes.Master
    startd_type = htcondor.AdTypes.Startd

    try:
        while True:
            logging.info("Beginning command consumer cycle")
            config.db_open()
            db_session = config.db_session
            # A set, so a central manager shared by several groups is only
            # processed once per cycle.
            condor_hosts_set = {
                group.condor_central_manager
                for group in db_session.query(GROUPS)
            }

            # One collector session per host.  The invalidation pass below
            # must talk to the same host whose machines it is processing,
            # not to whichever session happened to be created last.
            collectors = {}
            uncommitted_updates = 0
            for condor_host in condor_hosts_set:
                try:
                    condor_session = htcondor.Collector(condor_host)
                except Exception as exc:
                    logging.exception(
                        "Failed to locate condor daemon, skipping...:")
                    logging.error(exc)
                    continue
                collectors[condor_host] = condor_session

                # Query database for machines to be retired.
                for resource in db_session.query(Resource).filter(
                        Resource.condor_host == condor_host,
                        Resource.retire_request_time > Resource.retired_time):
                    logging.info("Retiring machine %s", resource.name)
                    try:
                        # Master ads are named by the part after "@".
                        condor_classad = condor_session.query(
                            master_type,
                            'Name=="%s"' % resource.name.split("@")[1])[0]
                        htcondor.send_command(
                            condor_classad,
                            htcondor.DaemonCommands.DaemonsOffPeaceful)

                        resource.retired_time = int(time.time())
                        db_session.merge(resource)
                        uncommitted_updates = uncommitted_updates + 1
                        if uncommitted_updates >= config.batch_commit_size:
                            try:
                                db_session.commit()
                                uncommitted_updates = 0
                            except Exception as exc:
                                logging.exception(
                                    "Failed to commit batch of retired machines, aborting cycle..."
                                )
                                logging.error(exc)
                                # Stop retiring on this host.  The counter is
                                # left non-zero, so the commit after this loop
                                # retries and aborts the cycle if it fails.
                                break

                    except Exception as exc:
                        logging.exception(
                            "Failed to retire machine, rebooting command poller..."
                        )
                        logging.error(exc)
                        # SystemExit is not an Exception, so it passes
                        # through the outer handler and ends the process.
                        sys.exit(1)

            if uncommitted_updates > 0:
                try:
                    db_session.commit()
                except Exception as exc:
                    logging.exception(
                        "Failed to commit retire machine, aborting cycle...")
                    logging.error(exc)
                    # Drop collector and DB references before sleeping.
                    collectors.clear()
                    condor_session = None
                    config.db_close()
                    db_session = None
                    time.sleep(config.sleep_interval_command)
                    continue

            for condor_host in condor_hosts_set:
                condor_session = collectors.get(condor_host)
                if condor_session is None:
                    # No collector session could be created for this host
                    # earlier in the cycle; nothing can be invalidated.
                    continue

                # Query database for machines with no associated VM.
                master_list = []
                startd_list = []
                abort_host = False
                redundant_machine_list = db_session.query(
                    view_redundant_machines).filter(
                        view_redundant_machines.c.condor_host == condor_host)
                for resource in redundant_machine_list:
                    logging.info("Removing classads for machine %s",
                                 resource.name)
                    try:
                        condor_classad = condor_session.query(
                            master_type,
                            'Name=="%s"' % resource.name.split("@")[1])[0]
                        master_list.append(condor_classad)

                        condor_classad = condor_session.query(
                            startd_type, 'Name=="%s"' % resource.name)[0]
                        startd_list.append(condor_classad)
                    except IndexError:
                        # No matching ad (or no "@" in the name): there is
                        # nothing to invalidate for this machine.
                        pass
                    except Exception as exc:
                        logging.exception(
                            "Failed to retrieve machine classads, aborting...")
                        logging.error(exc)
                        abort_host = True
                        break

                if abort_host:
                    # Skip invalidation for this host only.
                    continue

                # Execute condor_advertise to remove classads.
                if startd_list:
                    startd_advertise_result = condor_session.advertise(
                        startd_list, "INVALIDATE_STARTD_ADS")
                    logging.info("condor_advertise result for startd ads: %s",
                                 startd_advertise_result)

                if master_list:
                    master_advertise_result = condor_session.advertise(
                        master_list, "INVALIDATE_MASTER_ADS")
                    logging.info("condor_advertise result for master ads: %s",
                                 master_advertise_result)

            logging.info("Completed command consumer cycle")
            # Drop collector and DB references before sleeping.
            collectors.clear()
            condor_session = None
            config.db_close(commit=True)
            db_session = None
            time.sleep(config.sleep_interval_command)

    except Exception as exc:
        logging.exception(
            "Command consumer while loop exception, process terminating...")
        logging.error(exc)
        config.db_close()