Exemplo n.º 1
0
def remove_dedicate_vc(args):
    """Unlabel nodes of a dedicated virtual cluster and optionally remove it.

    Nodes currently labeled ``args.vc_name`` are relabeled with the empty
    label, their resources are added back to the ``default`` queue, and the
    capacity percentage of every queue is recomputed against the enlarged
    default partition.

    Args:
        args: Parsed arguments providing ``resource_manager_ip``, ``vc_name``
            and ``nodes``. When ``nodes`` is ``None`` every node returned by
            ``get_nodes_info()`` is considered, and the dedicated queue and
            the cluster label are removed afterwards. Node names that are
            not present in ``get_nodes_info()`` are reported with a warning
            and skipped.
    """
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes
    # No explicit node list means "tear down the whole virtual cluster".
    remove_queue_flag = nodes is None

    logger.info("Unlabeling node...")
    nodes_info = yarn_operator.get_nodes_info()
    queues_info = yarn_operator.get_queues_info()
    if nodes is None:
        nodes = set(nodes_info.keys())

    # Keep only the requested nodes that actually carry this vc's label.
    # Unknown node names are skipped instead of raising KeyError.
    t_nodes = []
    for node in nodes:
        info = nodes_info.get(node)
        if info is None:
            logger.warning("Node not found: {}".format(node))
            continue
        if info["nodeLabel"] == vc_name:
            t_nodes.append(node)

    if t_nodes:
        default_queue = queues_info["default"]
        # Unless maxCapacity is a non-100 value at or below capacity,
        # set it to 100 before the percentages are recomputed.
        if default_queue["maxCapacity"] == 100 or \
                default_queue["maxCapacity"] > default_queue["capacity"]:
            default_queue["maxCapacity"] = 100.0

        # Sum the resources of the nodes being returned to the default
        # partition; the set is built once for O(1) membership tests.
        target_nodes = set(t_nodes)
        removed_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
        for node, info in nodes_info.items():
            if node in target_nodes:
                removed_resource += info["resource"]

        default_partition_resource = get_resource_by_label(
            nodes_info)[""]["resource"]
        default_vc_percentage = queues_info["default"]["capacity"] / 100.0
        default_vc_resource = default_partition_resource * default_vc_percentage

        # Both the default partition and the default queue grow by the
        # removed amount.
        new_default_partition_resource = default_partition_resource + removed_resource
        new_default_vc_resource = default_vc_resource + removed_resource

        # Convert percentages to GPU counts on the old partition size,
        # override the default queue's GPUs, then convert back to
        # percentages on the new partition size.
        queues_info_with_gpus = convert_percentage_to_gpus(
            queues_info, default_partition_resource)
        queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
        new_queues_percentage = convert_gpus_to_percentage(
            queues_info_with_gpus, new_default_partition_resource)
        new_queues_percentage = normalize_percentage(new_queues_percentage)

        updated_dict = {
            queue: {
                "capacity": info["capacity"],
                "maximum-capacity": info["maxCapacity"],
            }
            for queue, info in new_queues_percentage.items()
        }

        yarn_operator.label_nodes(t_nodes, "")
        yarn_operator.update_queue_capacity(updated_dict)

    if remove_queue_flag:
        logger.info("Removing dedicated vc...")
        if vc_name not in queues_info:
            logger.warning("Virtual cluster not found: {}.".format(vc_name))
        else:
            yarn_operator.remove_dedicated_queue(vc_name)

        logger.info("Removing cluster label...")
        if vc_name not in yarn_operator.get_cluster_labels():
            logger.warning("Cluster label not found: {}".format(vc_name))
        else:
            yarn_operator.remove_cluster_label(vc_name)
Exemplo n.º 2
0
def add_dedicate_vc(args):
    """Create a dedicated virtual cluster and move nodes into it.

    Adds the cluster label and the dedicated queue named ``args.vc_name``
    when they do not exist yet. If nodes are given, their resources are
    subtracted from the ``default`` queue, the capacity percentage of every
    queue is recomputed against the shrunken default partition, and the
    nodes are labeled with ``args.vc_name``.

    Args:
        args: Parsed arguments providing ``resource_manager_ip``, ``vc_name``
            and ``nodes``. When ``nodes`` is ``None`` or empty, only the
            label and the queue are created.

    Exits the process with status 1 when the default queue does not hold
    enough cpus, gpus or memory to give away the requested nodes.
    """
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes

    logger.info("Adding cluster label...")
    existing_labels = yarn_operator.get_cluster_labels()
    if vc_name in existing_labels:
        logger.warning("Label already exists: {}".format(vc_name))
    else:
        yarn_operator.add_cluster_label(vc_name)

    logger.info("Adding dedicated vc...")
    queues_info = yarn_operator.get_queues_info()
    if vc_name in queues_info:
        logger.warning(
            "Virtual cluster already exists: {}. Adding node to it".format(
                vc_name))
    else:
        yarn_operator.add_dedicated_queue(vc_name)

    nodes_info = yarn_operator.get_nodes_info()
    # ``nodes`` may be None when no node list was supplied; calling len() on
    # it would raise TypeError after the label and queue were already created.
    if nodes is not None and len(nodes) > 0:
        logger.info("Labeling node...")

        default_queue = queues_info["default"]
        # Unless maxCapacity is a non-100 value at or below capacity,
        # set it to 100 before the percentages are recomputed.
        if default_queue["maxCapacity"] == 100 or \
                default_queue["maxCapacity"] > default_queue["capacity"]:
            default_queue["maxCapacity"] = 100.0

        # Only requested nodes that currently have an empty label are
        # counted as resource taken away from the default partition.
        added_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
        for node, info in nodes_info.items():
            if node in nodes and info["nodeLabel"] == "":
                added_resource += info["resource"]

        default_partition_resource = get_resource_by_label(
            nodes_info)[""]["resource"]
        default_vc_percentage = queues_info["default"]["capacity"] / 100.0
        default_vc_resource = default_partition_resource * default_vc_percentage

        # Refuse to proceed if the default queue cannot cover the move.
        if default_vc_resource.cpus < added_resource.cpus \
            or default_vc_resource.gpus < added_resource.gpus \
                or default_vc_resource.memory < added_resource.memory:
            logger.error(
                "Default vc resource isn't enough for the dedicated vc, please free some resource"
            )
            sys.exit(1)

        # Both the default partition and the default queue shrink by the
        # added amount.
        new_default_partition_resource = default_partition_resource - added_resource
        new_default_vc_resource = default_vc_resource - added_resource

        # Convert percentages to GPU counts on the old partition size,
        # override the default queue's GPUs, then convert back to
        # percentages on the new partition size.
        queues_info_with_gpus = convert_percentage_to_gpus(
            queues_info, default_partition_resource)
        queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
        new_queues_percentage = convert_gpus_to_percentage(
            queues_info_with_gpus, new_default_partition_resource)
        new_queues_percentage = normalize_percentage(new_queues_percentage)

        updated_dict = {}
        for queue, info in new_queues_percentage.items():
            updated_dict[queue] = {
                "capacity": info["capacity"],
                "maximum-capacity": info["maxCapacity"]
            }
            # Every queue except "default" is sent with disable_preemption.
            if queue != "default":
                updated_dict[queue]["disable_preemption"] = True

        # NOTE(review): every entry of ``nodes`` is labeled here, while only
        # nodes with an empty label were counted in ``added_resource`` above
        # -- confirm that relabeling already-labeled nodes is intended.
        yarn_operator.label_nodes(nodes, vc_name)
        yarn_operator.update_queue_capacity(updated_dict)