Example #1
def read_schedulers(
    pbscmd: PBSCMD, resource_definitions: Dict[str, PBSProResourceDefinition]
) -> Dict[Hostname, PBSProScheduler]:
    parser = get_pbspro_parser()
    sched_dicts = pbscmd.qmgr_parsed("list", "sched")
    server_dicts = pbscmd.qmgr_parsed("list", "server")

    server_dicts_by_host = partition_single(server_dicts,
                                            lambda s: s["server_host"])

    ret: Dict[str, PBSProScheduler] = {}

    for sched_dict in sched_dicts:
        hostname = sched_dict["sched_host"]
        server_dict = server_dicts_by_host[hostname]

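        # merge server-level attributes into the scheduler dict without
        # overwriting anything the scheduler already defines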
        for key, value in server_dict.items():
            if key not in sched_dict:
                sched_dict[key] = value

        # this is a scheduler, so it has no parent shared resources
        resource_state = parser.parse_resource_state(
            sched_dict, parent_shared_resources=None)
        scheduler = PBSProScheduler(sched_dict, resource_state)
        ret[scheduler.hostname] = scheduler

    return ret
Example #2
    def __init__(self) -> None:
        clilib.CommonCLI.__init__(self, "pbspro")
        # bootstrap parser
        set_pbspro_parser(PBSProParser({}))
        self.pbscmd = PBSCMD(get_pbspro_parser())
        # lazily initialized
        self.__pbs_env: Optional[environment.PBSProEnvironment] = None
        self.__driver: Optional[PBSProDriver] = None
Example #3
def read_resource_definitions(
        pbscmd: PBSCMD, config: Dict) -> Dict[str, "PBSProResourceDefinition"]:
    ret: Dict[str, PBSProResourceDefinition] = {}
    res_dicts = pbscmd.qmgr_parsed("list", "resource")

    res_names = {x["name"] for x in res_dicts}

    # TODO I believe this is the only one, but leaving a config option
    # as a backup plan
    read_only = config.get("pbspro", {}).get("read_only_resources",
                                             ["host", "vnode"])

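    # resources can also be declared only in the default scheduler's sched_config
    # file (under its sched_priv directory), so parse that file as well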
    def_sched = pbscmd.qmgr_parsed("list", "sched", "default")
    sched_priv = def_sched[0]["sched_priv"]
    sched_config = os.path.join(sched_priv, "sched_config")
    from pbspro.parser import PBSProParser

    parser = PBSProParser(config)
    sched_resources = parser.parse_resources_from_sched_priv(sched_config)

    missing_res = sched_resources - res_names
    missing_res_dicts = []
    for res_name in missing_res:
        try:
            missing_res_dicts.extend(
                pbscmd.qmgr_parsed("list", "resource", res_name))
        except CalledProcessError as e:
            logging.warning(
                "Could not find resource %s that was defined in %s. Ignoring.",
                res_name,
                sched_config,
            )
            logging.fine(e)

    for rdict in res_dicts + missing_res_dicts:
        name = rdict["name"]
        res_type = RESOURCE_TYPES[rdict["type"]]
        flag: ResourceFlag = rdict.get("flag", "")  # type: ignore
        ret[name] = PBSProResourceDefinition(name, res_type, flag)
        if name in read_only:
            ret[name].read_only = True

    return ret
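
Example #6 below bootstraps this function with an empty config dict, in which case
read_only_resources falls back to ["host", "vnode"]. A minimal usage sketch along the
same lines, printing which resources ended up read-only (pbscmd is assumed to already
exist):

# hypothetical usage sketch; relies only on the attributes set in the function above
resource_definitions = read_resource_definitions(pbscmd, {})
for name, rdef in resource_definitions.items():
    print(name, rdef.read_only)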
Example #4
def parse_scheduler_nodes(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
) -> List[Node]:
    """
    Gets the current state of the nodes as the scheduler sees them, including resources,
    assigned resources, jobs currently running etc.
    """
    ret: List[Node] = []
    ignore_onprem = config.get("pbspro", {}).get("ignore_onprem", False)
    ignore_hostnames_re_expr = config.get("pbspro",
                                          {}).get("ignore_hostnames_re")
    ignore_hostnames_re = None
    if ignore_hostnames_re_expr:
        try:
            ignore_hostnames_re = re.compile(ignore_hostnames_re_expr)
        except Exception:
            logging.exception(
                f"Could not parse {ignore_hostnames_re_expr} as a regular expression"
            )
    ignored_hostnames = []

    for ndict in pbscmd.pbsnodes_parsed("-a"):
        if ignore_hostnames_re and ignore_hostnames_re.match(ndict["name"]):
            ignored_hostnames.append(ndict["name"])
            continue

        if ignore_onprem and ndict.get("resources_available.ccnodeid"):
            ignored_hostnames.append(ndict["name"])
            continue

        node = parse_scheduler_node(ndict, resource_definitions)

        if not node.available.get("ccnodeid"):
            node.metadata["override_resources"] = False
            logging.fine(
                "'ccnodeid' is not defined so %s has not been joined to the cluster by the autoscaler"
                + " yet or this is not a CycleCloud managed node",
                node,
            )
        ret.append(node)

    if ignored_hostnames:
        if len(ignored_hostnames) < 5:
            logging.info(
                f"Ignored {len(ignored_hostnames)} hostnames. {','.join(ignored_hostnames)}"
            )
        else:
            logging.info(
                f"Ignored {len(ignored_hostnames)} hostnames. {','.join(ignored_hostnames[:5])}..."
            )
    return ret
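
The two filters above are driven entirely by keys under the "pbspro" section of the
config dict. A minimal config sketch exercising both (the hostname pattern is purely
hypothetical):

# hypothetical config fragment for the filters read by parse_scheduler_nodes
config = {
    "pbspro": {
        "ignore_onprem": False,
        "ignore_hostnames_re": "^login\\d+$",  # hypothetical pattern
    }
}
nodes = parse_scheduler_nodes(config, pbscmd, resource_definitions)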
Example #5
def list_queue_names(pbscmd: PBSCMD) -> List[str]:
    ret = []

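    # `qstat -Q` prints a header row and a "---" separator before the per-queue rows;
    # skip both and take the first column (the queue name) from each remaining row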
    lines_less_header = pbscmd.qstat("-Q").splitlines()[1:]
    for line in lines_less_header:
        line = line.strip()

        if not line:
            continue

        if line.startswith("---"):
            continue

        qname = line.split()[0]
        ret.append(qname)

    return ret
Example #6
def get_pbspro_parser() -> PBSProParser:
    global _PARSER
    if _PARSER is None:
        # avoid circular import
        from pbspro.pbscmd import PBSCMD
        from pbspro.resource import read_resource_definitions

        # chicken / egg issue: we want the resource definitions
        # as a member of the parser, but we need the parser to parse
        # the definitions...
        # So create temp parser with no resource definitions
        _PARSER = PBSProParser({})
        pbscmd = PBSCMD(_PARSER)
        logging.warning("Using uninitialized PBSProParser: please call" +
                        " set_pbspro_parser before calling get_pbspro_parser")
        resource_definitions = read_resource_definitions(pbscmd, {})
        _PARSER = PBSProParser(resource_definitions)
    return _PARSER
Example #7
    def __init__(
        self,
        config: Dict,
        pbscmd: Optional[PBSCMD] = None,
        resource_definitions: Optional[Dict[str, PBSProResourceDefinition]] = None,
        down_timeout: int = 300,
    ) -> None:
        super().__init__("pbspro")
        self.config = config
        self.pbscmd = pbscmd or PBSCMD(get_pbspro_parser())
        self.__queues: Optional[Dict[str, PBSProQueue]] = None
        self.__shared_resources: Optional[Dict[str, SharedResource]] = None
        self.__resource_definitions = resource_definitions
        self.__read_only_resources: Optional[Set[str]] = None
        self.__jobs_cache: Optional[List[Job]] = None
        self.__scheduler_nodes_cache: Optional[List[Node]] = None
        self.down_timeout = down_timeout
        self.down_timeout_td = datetime.timedelta(seconds=self.down_timeout)
Example #8
def read_queues(
    config: Dict,
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    scheduler_shared_resources: Dict[str, conslib.SharedResource],
) -> Dict[str, PBSProQueue]:
    parser = get_pbspro_parser()

    ret: Dict[str, PBSProQueue] = {}
    qnames = list_queue_names(pbscmd)
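    # list every queue in a single qmgr call rather than one call per queue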
    queue_dicts = pbscmd.qmgr_parsed("list", "queue", ",".join(qnames))

    # Queue resources include things like ncpus, i.e. the total amount of ncpus etc.
    # They are meaningless as a shared constraint; they are only there for info purposes.
    ignore_queues = config.get("pbspro", {}).get("ignore_queues", [])

    for qdict in queue_dicts:
        state_count = parser.parse_state_counts(qdict["state_count"])

        resource_state = parser.parse_resource_state(
            qdict, scheduler_shared_resources)

        queue = PBSProQueue(
            name=qdict["name"],
            queue_type=qdict["queue_type"],
            node_group_key=qdict.get("node_group_key"),
            node_group_enable=qdict.get("node_group_enable",
                                        "").lower() == "true",
            total_jobs=int(qdict["total_jobs"]),
            state_count=state_count,
            resource_state=resource_state,
            resources_default=parser.parse_resources_default(qdict),
            default_chunk=parser.parse_default_chunk(qdict),
            resource_definitions=resource_definitions,
            enabled=qdict["enabled"].lower() == "true"
            and qdict["name"] not in ignore_queues,
            started=qdict["started"].lower() == "true",
        )
        ret[queue.name] = queue

    return ret
Example #9
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects
    """
    parser = get_pbspro_parser()
    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    ret: List[Job] = []

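    # -f requests full output; -t includes the subjobs of array jobs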
    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping",
                            job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring",
                            job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id,
                         qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id,
                         qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(
                jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(
                jdict["array_indices_remaining"])
        elif "[" in job_id:
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)

        pack = (PackingStrategy.PACK if rdict["place"]["arrangement"]
                in ["free", "pack"] else PackingStrategy.SCATTER)

        # SMP style jobs
        is_smp = (rdict["place"].get("grouping") == "host"
                  or rdict["place"]["arrangement"] == "pack")

        # pack jobs do not need to define node_count

        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1

        if is_smp:
            smp_multiplier = max(1, iterations) * max(1, node_count)
            # for key, value in list(rdict.items()):
            #     if isinstance(value, (float, int)):
            #         value = value * smp_multiplier
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. see our default htcq
        colocated = (not is_smp and queue.uses_placement
                     and rdict.get("ungrouped", "false").lower() == "false")

        sharing = rdict["place"].get("sharing")

        for n, chunk_base in enumerate(rdict["schedselect"]):

            chunk: Dict[str, Any] = {}

            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier
            # apply chunk_base _after_ copying rdict, since the chunk values
            # override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2

            chunk.update(chunk_base)
            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            my_job_id = job_id
            if len(rdict["schedselect"]) > 1:
                if "." in job_id:
                    job_index, host = job_id.split(".", 1)
                    my_job_id = "{}+{}.{}".format(job_index, n, host)
                else:
                    my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # record every genuine resource request here; non-resource
                # requests, like exclusive, should be ignored.
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working cons
                    # so we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
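
A hypothetical wiring of the inputs above: queues comes from read_queues (Example #8),
while the examples do not show where the set of schedulable resource names originates,
so the resource definition keys stand in for it purely for illustration:

# hypothetical usage sketch for parse_jobs
jobs = parse_jobs(
    pbscmd,
    resource_definitions,
    queues,                               # from read_queues (Example #8)
    set(resource_definitions.keys()),     # assumption: schedulable resource names
)
for job in jobs:
    print(job.name, job.iterations_remaining)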
Example #10
    def _initialize(self, command: str, config: Dict) -> None:

        resource_definitions = read_resource_definitions(self.pbscmd, config)
        set_pbspro_parser(PBSProParser(resource_definitions))
        self.pbscmd = PBSCMD(get_pbspro_parser())
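
Examples #2, #6, and #10 together imply a specific bootstrap order: install a placeholder
parser, read the real resource definitions with a temporary PBSCMD, then swap in a fully
initialized parser and rebuild PBSCMD. A minimal sketch of that sequence, assuming that
set_pbspro_parser and get_pbspro_parser live alongside PBSProParser in pbspro.parser (the
examples do not show their module) and that an empty config dict is acceptable:

# sketch of the initialization order implied by the examples above
from pbspro.parser import PBSProParser, get_pbspro_parser, set_pbspro_parser
from pbspro.pbscmd import PBSCMD
from pbspro.resource import read_resource_definitions

config = {}  # an empty autoscale config, as in Example #6

# 1. bootstrap parser with no resource definitions (the chicken/egg issue from Example #6)
set_pbspro_parser(PBSProParser({}))
pbscmd = PBSCMD(get_pbspro_parser())

# 2. read the real resource definitions, then replace the bootstrap parser
resource_definitions = read_resource_definitions(pbscmd, config)
set_pbspro_parser(PBSProParser(resource_definitions))

# 3. rebuild PBSCMD so it parses output with the fully initialized parser (Example #10)
pbscmd = PBSCMD(get_pbspro_parser())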