def clone_dcalc(dcalc: DemandCalculator) -> DemandCalculator:
    scheduler_nodes = [
        SchedulerNode(str(n.hostname), dict(n.resources))
        for n in dcalc.get_compute_nodes()
    ]
    return new_demand_calculator(
        {}, node_mgr=dcalc.node_mgr, existing_nodes=scheduler_nodes
    )

def _new_dc(
    bindings: MockClusterBinding, existing_nodes: Optional[List[Node]] = None
) -> DemandCalculator:
    existing_nodes = existing_nodes or []
    return new_demand_calculator(
        config={"_mock_bindings": bindings},
        existing_nodes=existing_nodes,
        node_history=NullNodeHistory(),
        singleton_lock=util.NullSingletonLock(),
    )

def clone_dcalc(dcalc: DemandCalculator) -> DemandCalculator:
    scheduler_nodes = [
        SchedulerNode(str(n.name), dict(n.resources))
        for n in dcalc.get_compute_nodes()
    ]
    # close the node history connection (e.g. SQLite), if there is one,
    # before handing the node manager to a new calculator
    if hasattr(dcalc.node_history, "conn"):
        conn = getattr(dcalc.node_history, "conn")
        conn.close()
    return new_demand_calculator(
        {}, node_mgr=dcalc.node_mgr, existing_nodes=scheduler_nodes
    )

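# A minimal usage sketch for clone_dcalc: run an extra scheduling pass over a
# snapshot of the current nodes without touching the original calculator's
# node queue. The job name and constraints here are illustrative assumptions,
# not part of the functions above.
def what_if_demand(dcalc: DemandCalculator) -> None:
    cloned = clone_dcalc(dcalc)
    cloned.add_job(Job("what-if", {"ncpus": 1}, iterations=10))
    result = cloned.finish()
    # report what the clone would allocate; the original dcalc is unchanged
    print_demand(["name", "job_ids", "*ncpus"], result)
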
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes
    via the DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # 10 cores (10 iterations x 1 ncpu), non-exclusive
    dcalc.add_job(
        Job(
            "tc-10",
            {"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # 10 exclusive nodes with 4 cores (ncpus) each
    dcalc.add_job(
        Job(
            "tn-10",
            {"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 5 exclusive nodes with 2 cores (ncpus) each
    dcalc.add_job(
        Job(
            "tn-2x5",
            {"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(["name", "job_ids", "nodearray", "ncpus", "*ncpus"], demand_result)

    assert len(demand_result.new_nodes) == 18

def scale_up() -> DemandCalculator:
    dcalc = new_demand_calculator(CONFIG)
    dcalc.add_job(
        Job("tc-100", {"node.nodearray": "htc", "ncpus": 1}, iterations=50)
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(columns, demand_result)
    dcalc.node_history.conn.close()
    return dcalc

def scale_down(dcalc: typing.Optional[DemandCalculator]) -> None:
    dcalc = dcalc or new_demand_calculator(CONFIG)
    dcalc.add_job(
        Job("tc-50", {"node.nodearray": "htc", "ncpus": 1}, iterations=25)
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(columns, demand_result)

    print(
        "The following nodes can be shut down: {}".format(
            ",".join([n.name for n in demand_result.unmatched_nodes])
        )
    )

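# The two functions above are written to chain: scale_down accepts the
# calculator returned by scale_up, so the idle-node bookkeeping from the
# first pass carries into the second. A minimal sketch of that flow:
def scale_up_then_down() -> None:
    dcalc = scale_up()
    scale_down(dcalc)
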
def new_demand_calculator(
    config: Dict,
    pbs_env: Optional[PBSProEnvironment] = None,
    pbs_driver: Optional["PBSProDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    if pbs_driver is None:
        pbs_driver = PBSProDriver(config)

    if pbs_env is None:
        pbs_env = envlib.from_driver(config, pbs_driver)

    if node_history is None:
        node_history = pbs_driver.new_node_history(config)  # keep it as a config

    node_mgr = new_node_manager(config, existing_nodes=pbs_env.scheduler_nodes)
    pbs_driver.preprocess_node_mgr(config, node_mgr)

    singleton_lock = singleton_lock or pbs_driver.new_singleton_lock(config)
    assert singleton_lock

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        node_mgr=node_mgr,
        node_history=node_history,
        node_queue=pbs_driver.new_node_queue(config),
        singleton_lock=singleton_lock,  # it will handle the None case
        existing_nodes=pbs_env.scheduler_nodes,
    )

    ccnode_id_added = False
    for bucket in demand_calculator.node_mgr.get_buckets():
        # ccnodeid will almost certainly not be defined. It just needs to be
        # defined once, so we add a default for all nodes the first time we
        # see it is missing.
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            hpc.autoscale.job.driver.add_ccnodeid_default_resource(
                demand_calculator.node_mgr
            )
            ccnode_id_added = True

    return demand_calculator

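# A hypothetical end-to-end cycle using the PBS Pro factory above. The config
# path mirrors the other examples in this file; pulling jobs out of the PBS
# queue and feeding them to add_job is elided, since the factory only wires
# up nodes, history, and locking.
def pbs_autoscale_cycle() -> None:
    config = json_load("/opt/cycle/scalelib/autoscale.json")
    dcalc = new_demand_calculator(config)
    # ... add jobs from the PBS queue here ...
    demand_result = dcalc.finish()
    dcalc.bootup()
    print_demand(["name", "job_ids", "/ncpus"], demand_result)
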
def auto():
    CONFIG = json_load("/opt/cycle/scalelib/autoscale.json")
    MIN_CORE_COUNT = 4
    WARM_BUFFER = 2

    # Get hosts / tasks
    celery_d = celery_status()

    dcalc = demandcalculator.new_demand_calculator(
        CONFIG,
        existing_nodes=celery_d.scheduler_nodes,
        node_history=SQLiteNodeHistory(),
    )

    dcalc.add_jobs(celery_d.jobs)

    n_jobs = len(celery_d.jobs)
    n_add_jobs = max(n_jobs + WARM_BUFFER, max(n_jobs, MIN_CORE_COUNT))
    if n_add_jobs > 0:
        # RIGHT-SIZE based on the minimum core count and warm buffer.
        # The padded jobs may float around and extend the idle timer, but
        # they appear to be placed in a stable order that is preserved
        # across autoscale runs.
        print("add padding of %d jobs, to existing %d" % (n_add_jobs, n_jobs))
        dcalc.add_jobs(job_buffer(n_add_jobs))

    demand_result = dcalc.finish()
    output_columns = [
        "name",
        "hostname",
        "job_ids",
        "required",
        "slots",
        "vm_size",
        "vcpu_count",
        "state",
    ]
    print_demand(output_columns, demand_result)
    dcalc.bootup()

    delete_result = dcalc.find_unmatched_for(at_least=180)
    if delete_result:
        try:
            dcalc.delete(delete_result)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

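# job_buffer is referenced above but not defined in this file. A minimal
# sketch of what it might look like, assuming one single-slot padding Job per
# requested unit; the "padding-%d" names and the {"ncpus": 1} constraint are
# illustrative assumptions only.
def job_buffer(n: int) -> List[Job]:
    return [Job("padding-%d" % i, {"ncpus": 1}) for i in range(n)]
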
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes
    via the DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # job requires 10 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tc-10",
            constraints={"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # job requires 10 nodes with 4 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tn-10",
            constraints={"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 2 x 5 nodes
    dcalc.add_job(
        Job(
            name="tn-2x5",
            constraints={"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # note that /ncpus will display available/total. ncpus will display the
    # total, and *ncpus will display available.
    print_demand(["name", "job_ids", "nodearray", "/ncpus"], demand_result)

def onprem_burst_demand() -> None:
    onprem001 = SchedulerNode(
        "onprem001", resources={"onprem": True, "nodetype": "A", "ncpus": 16}
    )
    onprem002 = SchedulerNode(
        "onprem002", resources={"onprem": True, "nodetype": "A", "ncpus": 32}
    )

    # onprem002 already has 10 cores occupied
    onprem002.available["ncpus"] -= 10

    dcalc = new_demand_calculator(CONFIG, existing_nodes=[onprem001, onprem002])
    dcalc.node_mgr.add_default_resource(
        {"node.nodearray": ["htc", "htcspot"]}, "nodetype", "A"
    )
    htc_buckets = [b for b in dcalc.node_mgr.get_buckets() if b.nodearray == "htc"]
    assert htc_buckets[0].resources["nodetype"] == "A"

    dcalc.node_mgr.add_default_resource({}, "nodetype", "B")

    # the first matching default wins: htc nodes keep nodetype A
    htc_buckets = [b for b in dcalc.node_mgr.get_buckets() if b.nodearray == "htc"]
    assert htc_buckets[0].resources["nodetype"] == "A"

    # we want 50 ncpus, but there are only 38 available on-premises
    # (16 + (32 - 10)), so we need to burst 12 more cores.
    dcalc.add_job(Job("tc-100", {"nodetype": "A", "ncpus": 1}, iterations=50))

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # also note we can add a default to a column with a colon, e.g.
    # onprem:False, since onprem is only defined on the on-premises nodes
    # and not on the Azure nodes.
    print_demand(
        ["name", "job_ids", "nodetype", "onprem:False", "*ncpus"], demand_result
    )

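# Summarizing the print_demand column syntax used across these examples
# (per the comments above): "ncpus" prints the total, "*ncpus" the available
# amount, "/ncpus" available/total, and "name:default" substitutes a default
# when a resource is undefined on a node. This helper exists only to show
# all of the forms in one place.
def show_all_column_forms(demand_result: DemandResult) -> None:
    print_demand(
        ["name", "nodetype", "onprem:False", "ncpus", "*ncpus", "/ncpus"],
        demand_result,
    )
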
def new_demand_calculator(
    config: Dict,
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    if ge_driver is None:
        ge_driver = new_driver(config, ge_env)

    if node_history is None:
        db_path = config.get("nodehistorydb")
        if not db_path:
            db_dir = "/opt/cycle/jetpack/system/bootstrap/gridengine"
            if not os.path.exists(db_dir):
                db_dir = os.getcwd()
            db_path = os.path.join(db_dir, "nodehistory.db")

        read_only = config.get("read_only", False)
        node_history = SQLiteNodeHistory(db_path, read_only)

        node_history.create_timeout = config.get("boot_timeout", 3600)
        node_history.last_match_timeout = config.get("idle_timeout", 300)

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        existing_nodes=ge_env.nodes,
        node_history=node_history,
        node_queue=ge_driver.new_node_queue(),
        singleton_lock=singleton_lock,  # it will handle the None case
    )

    for name, default_complex in ge_env.complexes.items():
        if name == "slots":
            continue

        if default_complex.default is None:
            continue

        if not default_complex.requestable:
            continue

        logging.trace("Adding default resource %s=%s", name, default_complex.default)
        demand_calculator.node_mgr.add_default_resource(
            {}, name, default_complex.default
        )

    ccnode_id_added = False
    slots_added: Set[str] = set()

    for bucket in demand_calculator.node_mgr.get_buckets():
        if "slots" not in bucket.resources and bucket.nodearray not in slots_added:
            default = (
                '"default_resources": [{"select": {"node.nodearray": "%s"}, '
                '"name": "slots", "value": "node.vcpu_count"}]' % (bucket.nodearray)
            )
            demand_calculator.node_mgr.add_default_resource(
                selection={"node.nodearray": bucket.nodearray},
                resource_name="slots",
                default_value="node.vcpu_count",
            )
            logging.warning(
                "slots is not defined for bucket {}. Using the default, "
                "which you can add to your config: {}".format(bucket, default)
            )
            slots_added.add(bucket.nodearray)

        # ccnodeid will almost certainly not be defined. It just needs to be
        # defined once, so we add a default for all nodes the first time we
        # see it is missing.
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            demand_calculator.node_mgr.add_default_resource(
                selection={},  # applies to all nodes
                resource_name="ccnodeid",
                default_value=lambda n: n.delayed_node_id.node_id,
            )
            ccnode_id_added = True

    return demand_calculator
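
# A hypothetical invocation of the GridEngine factory above. The config keys
# shown (boot_timeout, idle_timeout, read_only) are the ones the factory
# actually reads; everything else about this loop, including collecting jobs
# from the scheduler, is elided or an illustrative assumption.
def ge_autoscale_cycle() -> None:
    config = {"boot_timeout": 3600, "idle_timeout": 300, "read_only": False}
    dcalc = new_demand_calculator(config)
    # ... add jobs from the GridEngine queue here ...
    demand_result = dcalc.finish()
    dcalc.bootup()
    print_demand(["name", "job_ids", "slots"], demand_result)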