def target_counts_demand() -> None:
    """
    Demonstrate 'target count' style allocation of ncpus and nodes via the
    DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # 10 cores (1 ncpu x 10 iterations)
    dcalc.add_job(
        Job(
            "tc-10",
            {"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # 10 exclusive nodes with 4 ncpus each
    dcalc.add_job(
        Job(
            "tn-10",
            {"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 5 exclusive nodes with 2 ncpus each
    dcalc.add_job(
        Job(
            "tn-2x5",
            {"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(["name", "job_ids", "nodearray", "ncpus", "*ncpus"], demand_result)

    assert len(demand_result.new_nodes) == 18
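# These examples assume a module-level CONFIG dict and DRY_RUN flag. A minimal sketch
# of what those might look like is below; the key names mirror the autoscale.json used
# in the celery example further down, but the exact values are assumptions, not a
# definitive schema.
CONFIG = {
    "url": "https://cyclecloud.example.com",  # hypothetical CycleCloud URL
    "username": "svc-autoscale",              # hypothetical credentials
    "password": "***",
    "cluster_name": "htc-cluster",            # hypothetical cluster name
}
DRY_RUN = True  # when True, compute and print demand but never call bootup()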
def print_demand(
    config: Dict,
    demand_result: DemandResult,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
    log: bool = False,
) -> DemandResult:
    # and let's use the demand printer to print the demand_result.
    for node in demand_result.matched_nodes + demand_result.unmatched_nodes:
        # ignore @allhosts - every node will eventually end up there.
        node.available["hostgroups"] = [
            x for x in get_node_hostgroups(config, node) if x != "@allhosts"
        ]
        node._resources["hostgroups"] = node.available["hostgroups"]

    if not output_columns:
        output_columns = config.get(
            "output_columns",
            [
                "name",
                "hostname",
                "job_ids",
                "hostgroups",
                "exists",
                "required",
                "managed",
                "slots",
                "*slots",
                "vm_size",
                "memory",
                "vcpu_count",
                "state",
                "placement_group",
                "create_time_remaining",
                "idle_time_remaining",
            ],
        )

    if "all" in output_columns:  # type: ignore
        output_columns = []

    output_format = output_format or "table"

    demandprinter.print_demand(
        output_columns,
        demand_result,
        output_format=output_format,
        log=log,
    )
    return demand_result
def _print_demand(output_format: OutputFormat) -> str:
    stream = io.StringIO()

    node = SchedulerNode("tux", {"ncpus": 2, "mem": Memory.value_of("1.0g")})
    node.available["ncpus"] = 1
    node.assign("11")
    node.assign("12")

    result = DemandResult([], [node], [], [])

    print_demand(
        ["hostname", "job_ids", "ncpus", "*ncpus", "mem"],
        result,
        stream=stream,
        output_format=output_format,
    )
    return stream.getvalue()
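# A minimal sketch of how _print_demand might be exercised in a test. It assumes
# "table" and "json" are among the supported OutputFormat values; the exact rendering
# is up to demandprinter, so only coarse assertions are made here.
def test_print_demand_formats() -> None:
    table = _print_demand("table")
    assert "tux" in table      # the node shows up as a row in the table output

    json_out = _print_demand("json")
    assert "tux" in json_out   # and as an entry in the json output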
def common_cluster_test(
    qsub_commands: List[str],
    pg_counts: Optional[Dict[str, int]] = None,
    previous_dcalc: Optional[DemandCalculator] = None,
    **array_counts: int
) -> DemandCalculator:
    pg_counts = pg_counts or {}
    dcalc = common_cluster(qsub_commands, previous_dcalc)
    demand = dcalc.get_demand()
    demandprinter.print_demand(["name", "job_ids", "placement_group"], demand)

    # sanity check that we don't recreate the same node
    partition_single(demand.new_nodes, lambda n: n.name)

    by_array = partition(demand.new_nodes, lambda n: n.nodearray)
    by_pg = partition(demand.new_nodes, lambda n: n.placement_group)

    if set(by_pg.keys()) != set([None]):
        if set(by_pg.keys()) != set(pg_counts.keys()):
            assert False, "\n%s\n%s" % (
                [(x, len(y)) for x, y in by_pg.items()],
                pg_counts,
            )
        assert set(by_pg.keys()) == set(pg_counts.keys())
        assert not (bool(by_pg) ^ bool(pg_counts))

    if pg_counts:
        for pg_name, count in pg_counts.items():
            assert pg_name in by_pg
            assert (
                len(by_pg[pg_name]) == count
            ), "Expected pg {} to have {} nodes. Found {}. Full {}".format(
                pg_name,
                count,
                len(by_pg[pg_name]),
                [(x, len(y)) for x, y in by_pg.items()],
            )

        for pg_name in by_pg:
            assert pg_name in pg_counts

    for nodearray_name, count in array_counts.items():
        assert nodearray_name in by_array
        assert len(by_array[nodearray_name]) == count, [
            n.name for n in by_array[nodearray_name]
        ]

    for nodearray_name, node_list in by_array.items():
        assert nodearray_name in array_counts

    return dcalc
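# Hypothetical usage of common_cluster_test. The qsub command string and the expected
# per-array count below are illustrative only; both depend on the cluster fixture built
# by common_cluster, which is not shown in this snippet.
def test_htc_array_scale_sketch() -> None:
    common_cluster_test(
        ["qsub -t 1-10 sleep.sh"],  # assumed array-job submission, 10 tasks
        htc=3,                      # expect 3 new nodes in the 'htc' nodearray
    )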
def scale_up() -> DemandCalculator:
    dcalc = new_demand_calculator(CONFIG)
    dcalc.add_job(
        Job("tc-100", {"node.nodearray": "htc", "ncpus": 1}, iterations=50)
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(columns, demand_result)

    dcalc.node_history.conn.close()
    return dcalc
def scale_down(dcalc: typing.Optional[DemandCalculator]) -> None:
    dcalc = dcalc or new_demand_calculator(CONFIG)
    dcalc.add_job(
        Job("tc-50", {"node.nodearray": "htc", "ncpus": 1}, iterations=25)
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(columns, demand_result)

    print(
        "The following nodes can be shut down: {}".format(
            ",".join([n.name for n in demand_result.unmatched_nodes])
        )
    )
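# A sketch of going one step further than scale_down and actually removing the idle
# nodes rather than just printing them. dcalc.delete(...) is the same call used in the
# celery autoscale example below; treating unmatched nodes as immediately deletable is
# an assumption here (the celery example waits until they have been idle for at least
# 180 seconds via find_unmatched_for).
def scale_down_and_delete() -> None:
    dcalc = new_demand_calculator(CONFIG)
    scale_down(dcalc)

    idle = dcalc.get_demand().unmatched_nodes
    if not DRY_RUN and idle:
        dcalc.delete(idle)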
def print_demand(
    config: Dict,
    demand_result: DemandResult,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
    log: bool = False,
) -> DemandResult:
    # and let's use the demand printer to print the demand_result.
    if not output_columns:
        output_columns = config.get(
            "output_columns",
            [
                "name",
                "hostname",
                "job_ids",
                "*hostgroups",
                "exists",
                "required",
                "managed",
                "slots",
                "*slots",
                "vm_size",
                "memory",
                "vcpu_count",
                "state",
                "placement_group",
                "create_time_remaining",
                "idle_time_remaining",
            ],
        )

    if "all" in output_columns:  # type: ignore
        output_columns = []

    output_format = output_format or "table"

    demandprinter.print_demand(
        output_columns,
        demand_result,
        output_format=output_format,
        log=log,
    )
    return demand_result
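# Example of how the wrapper above might be called, e.g. from a CLI handler. The config
# shown is a minimal assumption; only the optional "output_columns" key is read by the
# wrapper itself.
def demand_cli_sketch(demand_result: DemandResult) -> None:
    config = {"output_columns": ["name", "hostname", "job_ids", "slots", "*slots"]}
    # prints a table and returns the same DemandResult for further processing
    print_demand(config, demand_result, output_format="table")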
def auto() -> None:
    CONFIG = json_load("/opt/cycle/scalelib/autoscale.json")
    MIN_CORE_COUNT = 4
    WARM_BUFFER = 2

    # Get hosts / tasks
    celery_d = celery_status()

    dcalc = demandcalculator.new_demand_calculator(
        CONFIG,
        existing_nodes=celery_d.scheduler_nodes,
        node_history=SQLiteNodeHistory(),
    )

    dcalc.add_jobs(celery_d.jobs)
    n_jobs = len(celery_d.jobs)
    n_add_jobs = max(n_jobs + WARM_BUFFER, max(n_jobs, MIN_CORE_COUNT))

    if n_add_jobs > 0:
        # RIGHT-SIZE based on Min Count and Buffer.
        # It's possible that the padded jobs will float around, extending the timer,
        # but it seems like they're placed in some kind of normal order that's
        # preserved across autoscale runs.
        print("add padding of %d jobs to existing %d" % (n_add_jobs, n_jobs))
        dcalc.add_jobs(job_buffer(n_add_jobs))

    demand_result = dcalc.finish()
    output_columns = [
        "name",
        "hostname",
        "job_ids",
        "required",
        "slots",
        "vm_size",
        "vcpu_count",
        "state",
    ]
    print_demand(output_columns, demand_result)
    dcalc.bootup()

    delete_result = dcalc.find_unmatched_for(at_least=180)
    if delete_result:
        try:
            dcalc.delete(delete_result)
        except Exception as e:
            _exit_code = 1
            logging.warning("Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))
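# job_buffer() is referenced above but not defined in this snippet. A minimal sketch of
# what it might look like, assuming each padding entry is a 1-ncpu placeholder job; the
# job name prefix and resource shape are assumptions, not the actual helper.
def job_buffer(count: int) -> List[Job]:
    # placeholder jobs keep a warm pool of capacity allocated between autoscale runs
    return [Job("buffer-%03d" % i, {"ncpus": 1}, iterations=1) for i in range(count)]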
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes
    via the DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # job requires 10 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tc-10",
            constraints={"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # job requires 10 nodes with 4 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tn-10",
            constraints={"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 2 x 5 nodes
    dcalc.add_job(
        Job(
            name="tn-2x5",
            constraints={"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # note that /ncpus will display available/total. ncpus will display the total,
    # and *ncpus will display available.
    print_demand(["name", "job_ids", "nodearray", "/ncpus"], demand_result)
def onprem_burst_demand() -> None:
    onprem001 = SchedulerNode(
        "onprem001", resources={"onprem": True, "nodetype": "A", "ncpus": 16}
    )
    onprem002 = SchedulerNode(
        "onprem002", resources={"onprem": True, "nodetype": "A", "ncpus": 32}
    )

    # onprem002 already has 10 cores occupied
    onprem002.available["ncpus"] -= 10

    dcalc = new_demand_calculator(CONFIG, existing_nodes=[onprem001, onprem002])
    dcalc.node_mgr.add_default_resource(
        {"node.nodearray": ["htc", "htcspot"]}, "nodetype", "A"
    )
    assert [b for b in dcalc.node_mgr.get_buckets() if b.nodearray == "htc"][
        0
    ].resources["nodetype"] == "A"

    dcalc.node_mgr.add_default_resource({}, "nodetype", "B")
    assert [b for b in dcalc.node_mgr.get_buckets() if b.nodearray == "htc"][
        0
    ].resources["nodetype"] == "A"

    # we want 50 ncpus, but only 38 are available on-premises, so we need to
    # burst 12 more cores.
    dcalc.add_job(Job("tc-100", {"nodetype": "A", "ncpus": 1}, iterations=50))

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # also note we can add defaults to the column by adding a :, like
    # onprem:False, as this is only defined on the onprem nodes and not
    # on the Azure nodes.
    print_demand(
        ["name", "job_ids", "nodetype", "onprem:False", "*ncpus"], demand_result
    )