Example #1
def clone_dcalc(dcalc: DemandCalculator) -> DemandCalculator:
    scheduler_nodes = [
        SchedulerNode(str(n.hostname), dict(n.resources))
        for n in dcalc.get_compute_nodes()
    ]
    return new_demand_calculator(
        {}, node_mgr=dcalc.node_mgr, existing_nodes=scheduler_nodes
    )
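A minimal usage sketch for the helper above. It assumes dcalc, Job, and print_demand are already available as in the other examples on this page; the job name and constraints are illustrative only.

# what-if copy: add a hypothetical job without disturbing the original calculator
dcalc_copy = clone_dcalc(dcalc)
dcalc_copy.add_job(Job("what-if", {"ncpus": 1}, iterations=4))
print_demand(["name", "job_ids", "*ncpus"], dcalc_copy.finish())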
Example #2
def _new_dc(bindings: MockClusterBinding, existing_nodes: Optional[List[Node]] = None):

    existing_nodes = existing_nodes or []
    return new_demand_calculator(
        config={"_mock_bindings": bindings},
        existing_nodes=existing_nodes,
        node_history=NullNodeHistory(),
        singleton_lock=util.NullSingletonLock(),
    )
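A hedged sketch of how this test helper might be invoked; the MockClusterBinding instance (bindings) is assumed to be configured elsewhere in the test, and the job below is illustrative only.

dcalc = _new_dc(bindings)
dcalc.add_job(Job("test-tc-2", {"ncpus": 1}, iterations=2))
demand_result = dcalc.finish()
assert len(demand_result.new_nodes) >= 1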
Example #3
def clone_dcalc(dcalc: DemandCalculator) -> DemandCalculator:
    scheduler_nodes = [
        SchedulerNode(str(n.name), dict(n.resources))
        for n in dcalc.get_compute_nodes()
    ]
    # release any open SQLite node-history connection before building the clone
    if hasattr(dcalc.node_history, "conn"):
        conn = getattr(dcalc.node_history, "conn")
        conn.close()
    return new_demand_calculator({},
                                 node_mgr=dcalc.node_mgr,
                                 existing_nodes=scheduler_nodes)
Example #4
def target_counts_demand() -> None:
    """
    TODO
    """
    dcalc = new_demand_calculator(CONFIG)

    # 10 cores (10 iterations x 1 ncpus each)
    dcalc.add_job(
        Job(
            "tc-10",
            {
                "node.nodearray": "htc",
                "ncpus": 1,
                "exclusive": False
            },
            iterations=10,
        ))

    # 10 nodes
    dcalc.add_job(
        Job(
            "tn-10",
            {
                "node.nodearray": "htc",
                "ncpus": 4,
                "exclusive": True
            },
            node_count=10,
        ))

    # 2 x 5 nodes: 5 exclusive nodes with 2 ncpus each
    dcalc.add_job(
        Job(
            "tn-2x5",
            {
                "node.nodearray": "htc",
                "ncpus": 2,
                "exclusive": True
            },
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(["name", "job_ids", "nodearray", "ncpus", "*ncpus"],
                 demand_result)

    assert len(demand_result.new_nodes) == 18
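The CONFIG and DRY_RUN globals referenced in Examples #4-#6, #9 and #10 are defined outside the snippets shown here. A sketch of what they might look like; every key and value below is a placeholder, and the exact settings depend on your CycleCloud deployment.

CONFIG = {
    "url": "https://localhost:9443",   # CycleCloud REST endpoint (assumption)
    "username": "svc-autoscale",       # placeholder credentials
    "password": "<redacted>",
    "cluster_name": "htc-cluster",     # cluster to autoscale (assumption)
}
DRY_RUN = True  # when True, the examples skip dcalc.bootup()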
Example #5
    def scale_up() -> DemandCalculator:
        dcalc = new_demand_calculator(CONFIG)

        dcalc.add_job(
            Job("tc-100", {"node.nodearray": "htc", "ncpus": 1}, iterations=50)
        )

        demand_result = dcalc.finish()

        if not DRY_RUN:
            dcalc.bootup()

        print_demand(columns, demand_result)

        dcalc.node_history.conn.close()

        return dcalc
Example #6
    def scale_down(dcalc: typing.Optional[DemandCalculator]) -> None:
        dcalc = dcalc or new_demand_calculator(CONFIG)
        dcalc.add_job(
            Job("tc-50", {
                "node.nodearray": "htc",
                "ncpus": 1
            }, iterations=25))

        demand_result = dcalc.finish()

        if not DRY_RUN:
            dcalc.bootup()

        print_demand(columns, demand_result)

        print("The following nodes can be shutdown: {}".format(",".join(
            [n.name for n in demand_result.unmatched_nodes])))
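Examples #5 and #6 are intended to run as a pair. A hedged sketch of driver code that ties them together, reusing the calculator returned by scale_up so scale_down sees the same node state:

dcalc = scale_up()
scale_down(dcalc)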
Example #7
def new_demand_calculator(
    config: Dict,
    pbs_env: Optional[PBSProEnvironment] = None,
    pbs_driver: Optional["PBSProDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    if pbs_driver is None:
        pbs_driver = PBSProDriver(config)

    if pbs_env is None:
        pbs_env = envlib.from_driver(config, pbs_driver)

    if node_history is None:
        node_history = pbs_driver.new_node_history(config)

    # keep it as a config
    node_mgr = new_node_manager(config, existing_nodes=pbs_env.scheduler_nodes)
    pbs_driver.preprocess_node_mgr(config, node_mgr)
    singleton_lock = singleton_lock or pbs_driver.new_singleton_lock(config)
    assert singleton_lock

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        node_mgr=node_mgr,
        node_history=node_history,
        node_queue=pbs_driver.new_node_queue(config),
        singleton_lock=singleton_lock,  # it will handle the None case
        existing_nodes=pbs_env.scheduler_nodes,
    )

    ccnode_id_added = False

    for bucket in demand_calculator.node_mgr.get_buckets():

        # ccnodeid will almost certainly not be defined. It just needs
        # to be defined once, so we will add a default for all nodes
        # the first time we see it is missing
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            hpc.autoscale.job.driver.add_ccnodeid_default_resource(
                demand_calculator.node_mgr)
            ccnode_id_added = True

    return demand_calculator
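A hedged sketch of calling the PBS wrapper above from an autoscale entry point. Loading the config with the standard json module, and the path shown, are assumptions rather than part of the example.

import json

with open("/opt/cycle/pbspro/autoscale.json") as fp:
    config = json.load(fp)

dcalc = new_demand_calculator(config)
demand_result = dcalc.finish()
dcalc.bootup()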
Example #8
def auto():
    CONFIG = json_load("/opt/cycle/scalelib/autoscale.json")

    MIN_CORE_COUNT = 4
    WARM_BUFFER = 2

    # Get hosts / tasks
    celery_d = celery_status()

    dcalc = demandcalculator.new_demand_calculator(
        CONFIG,
        existing_nodes=celery_d.scheduler_nodes,
        node_history=SQLiteNodeHistory())

    dcalc.add_jobs(celery_d.jobs)
    n_jobs = len(celery_d.jobs)
    n_add_jobs = max(n_jobs + WARM_BUFFER, max(n_jobs, MIN_CORE_COUNT))
    if n_add_jobs > 0:
        # RIGHT-SIZE based on Min Count and Buffer
        # It's possible that the padded jobs will float around extending the timer
        # but it seems like they're placed in some kind of normal order that's
        # preserved across autoscale runs
        print("add padding of %d jobs, to existing %d" % (n_add_jobs, n_jobs))
        dcalc.add_jobs(job_buffer(n_add_jobs))

    demand_result = dcalc.finish()
    output_columns = [
        "name", "hostname", "job_ids", "required", "slots", "vm_size",
        "vcpu_count", "state"
    ]

    print_demand(output_columns, demand_result)
    dcalc.bootup()
    delete_result = dcalc.find_unmatched_for(at_least=180)
    if delete_result:
        try:
            dcalc.delete(delete_result)
        except Exception as e:
            _exit_code = 1
            logging.warning(
                "Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))
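The job_buffer helper used above is defined elsewhere in the Celery example. A hypothetical sketch of what it might look like: it pads demand with single-core placeholder jobs so a warm buffer of nodes stays allocated.

from typing import List

def job_buffer(n: int) -> List[Job]:
    # sketch only: n placeholder single-core jobs named buffer-0 .. buffer-(n-1)
    return [Job("buffer-%d" % i, {"ncpus": 1}, iterations=1) for i in range(n)]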
Example #9
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes via the
    DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # job requires 10 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tc-10",
            constraints={"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # job requires 10 nodes with 4 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tn-10",
            constraints={"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 2 x 5 nodes
    dcalc.add_job(
        Job(
            name="tn-2x5",
            constraints={"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # note that /ncpus will display available/total. ncpus will display the total, and
    # *ncpus will display available.
    print_demand(["name", "job_ids", "nodearray", "/ncpus"], demand_result)
Example #10
def onprem_burst_demand() -> None:
    onprem001 = SchedulerNode("onprem001",
                              resources={
                                  "onprem": True,
                                  "nodetype": "A",
                                  "ncpus": 16
                              })
    onprem002 = SchedulerNode("onprem002",
                              resources={
                                  "onprem": True,
                                  "nodetype": "A",
                                  "ncpus": 32
                              })

    # onprem002 already has 10 cores occupied
    onprem002.available["ncpus"] -= 10

    dcalc = new_demand_calculator(CONFIG,
                                  existing_nodes=[onprem001, onprem002])
    dcalc.node_mgr.add_default_resource({"node.nodearray": ["htc", "htcspot"]},
                                        "nodetype", "A")
    assert [b for b in dcalc.node_mgr.get_buckets()
            if b.nodearray == "htc"][0].resources["nodetype"] == "A"
    dcalc.node_mgr.add_default_resource({}, "nodetype", "B")

    assert [b for b in dcalc.node_mgr.get_buckets()
            if b.nodearray == "htc"][0].resources["nodetype"] == "A"
    # we want 50 ncpus, but only 38 are available on-premises, so we need to
    # burst 12 more cores.
    dcalc.add_job(Job("tc-100", {"nodetype": "A", "ncpus": 1}, iterations=50))

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # also note we can give a column a default by appending ":<value>", e.g.
    # onprem:False, since onprem is only defined on the on-premises nodes and
    # not on the Azure nodes.
    print_demand(["name", "job_ids", "nodetype", "onprem:False", "*ncpus"],
                 demand_result)
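A small hedged follow-up to the example above: listing only the burst (newly requested Azure) nodes from the result, using the new_nodes field also seen in Example #4.

azure_burst = [n.name for n in demand_result.new_nodes]
print("bursting to Azure with: {}".format(",".join(azure_burst)))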
Example #11
def new_demand_calculator(
    config: Dict,
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    if ge_driver is None:
        ge_driver = new_driver(config, ge_env)

    if node_history is None:
        db_path = config.get("nodehistorydb")
        if not db_path:
            db_dir = "/opt/cycle/jetpack/system/bootstrap/gridengine"
            if not os.path.exists(db_dir):
                db_dir = os.getcwd()
            db_path = os.path.join(db_dir, "nodehistory.db")

        read_only = config.get("read_only", False)
        node_history = SQLiteNodeHistory(db_path, read_only)

        node_history.create_timeout = config.get("boot_timeout", 3600)
        node_history.last_match_timeout = config.get("idle_timeout", 300)

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        existing_nodes=ge_env.nodes,
        node_history=node_history,
        node_queue=ge_driver.new_node_queue(),
        singleton_lock=singleton_lock,  # it will handle the none case
    )

    for name, default_complex in ge_env.complexes.items():
        if name == "slots":
            continue

        if default_complex.default is None:
            continue

        if not default_complex.requestable:
            continue

        logging.trace("Adding default resource %s=%s", name,
                      default_complex.default)
        demand_calculator.node_mgr.add_default_resource(
            {}, name, default_complex.default)

    ccnode_id_added = False
    slots_added: Set[str] = set()

    for bucket in demand_calculator.node_mgr.get_buckets():
        if "slots" not in bucket.resources and bucket.nodearray not in slots_added:
            default = (
                '"default_resources": [{"select": {"node.nodearray": "%s"}, "name": "slots", "value": "node.vcpu_count"}]'
                % (bucket.nodearray))
            demand_calculator.node_mgr.add_default_resource(
                selection={"node.nodearray": bucket.nodearray},
                resource_name="slots",
                default_value="node.vcpu_count",
            )

            logging.warning(
                """slots is not defined for bucket {}. Using the default, which you can add to your config: {}"""
                .format(bucket, default))
            slots_added.add(bucket.nodearray)

        # ccnodeid will almost certainly not be defined. It just needs
        # to be defined once, so we will add a default for all nodes
        # the first time we see it is missing
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            demand_calculator.node_mgr.add_default_resource(
                selection={},  # applies to all nodes
                resource_name="ccnodeid",
                default_value=lambda n: n.delayed_node_id.node_id,
            )
            ccnode_id_added = True

    return demand_calculator
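The warning above prints a default_resources entry that can be added to the autoscale config so that slots is defined up front. A hedged sketch of applying it programmatically before calling new_demand_calculator; the nodearray name "htc" is an assumption.

config.setdefault("default_resources", []).append(
    {"select": {"node.nodearray": "htc"}, "name": "slots", "value": "node.vcpu_count"}
)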