示例#1
0
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    """Poll the monitor until load metrics match *expected_resource_usage*.

    Requests 42 CPUs, then repeatedly refreshes the monitor's load metrics,
    strips volatile resources (memory, object store memory, per-node
    ``node:...`` keys) from the reported usage, and compares the remainder
    against the expectation.

    Args:
        monitor: Autoscaler monitor under test (project type).
        expected_resource_usage: Pair of usage dicts to match exactly; if
            None, wait until both sanitized entries are non-empty instead.
        timeout: Approximate seconds to wait before giving up.

    Returns:
        The sanitized resource-usage pair from the final iteration.

    Raises:
        ValueError: If the expectation is not met within *timeout* seconds.
    """
    request_resources(num_cpus=42)

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        # Strip volatile keys from BOTH usage entries symmetrically.
        # (Fixes a copy-paste bug where "object_store_memory" was checked
        # on entry 1 but deleted from entry 0.)
        for entry in resource_usage[:2]:
            entry.pop("memory", None)
            entry.pop("object_store_memory", None)
            for key in [k for k in entry if k.startswith("node:")]:
                del entry[key]

        if expected_resource_usage is None:
            # No explicit expectation: wait until both entries are non-empty.
            if all(resource_usage):
                break
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            break
        else:
            timeout -= 1
            time.sleep(1)

        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    return resource_usage
示例#2
0
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    """Poll the monitor until load metrics match *expected_resource_usage*.

    Requests 42 CPUs, then repeatedly refreshes the monitor's load metrics,
    strips volatile resources (memory, object store memory, per-node
    ``node:...`` keys) from the reported usage, and compares the remainder
    against the expectation.

    Args:
        monitor: Autoscaler monitor under test (project type).
        expected_resource_usage: Pair of usage dicts to match exactly; if
            None, wait until both sanitized entries are non-empty instead.
        timeout: Approximate seconds to wait before giving up.

    Returns:
        The sanitized resource-usage pair from the final iteration.

    Raises:
        ValueError: If the expectation is not met within *timeout* seconds.
    """
    request_resources(num_cpus=42)

    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        # Strip volatile keys from BOTH usage entries symmetrically.
        # (Fixes a copy-paste bug where "object_store_memory" was checked
        # on entry 1 but deleted from entry 0.)
        for entry in resource_usage[:2]:
            entry.pop("memory", None)
            entry.pop("object_store_memory", None)
            for key in [k for k in entry if k.startswith("node:")]:
                del entry[key]

        if expected_resource_usage is None:
            # No explicit expectation: wait until both entries are non-empty.
            if all(resource_usage):
                break
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            break
        else:
            timeout -= 1
            time.sleep(1)

        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    return resource_usage
示例#3
0
    async def expand_cluster(self, current_workers, allocation):
        """Request resources for *allocation* and wait for nodes to arrive.

        Detects workers whose IPs no longer match any live node, requests
        one resource bundle per target node (plus one per dead worker),
        then polls until the cluster is ready, the timeout expires, or an
        immediate allocation is forced. Returns *allocation*, possibly
        truncated to the number of nodes actually available.
        """
        logging.info("Attempting to expand cluster to "
                     f"{len(allocation)} nodes")
        live_ips = {n["NodeManagerAddress"] for n in self.get_nodes()}
        # Workers whose recorded IP is not among any live node's address.
        dead_workers = {
            wid: ip
            for wid, ip in current_workers.items() if ip not in live_ips
        }
        if len(dead_workers) == len(current_workers):
            # Every known worker is gone; allow extra time for a full rescale.
            logging.info("No live workers found. "
                         "Waiting longer than specified for rescaling.")
            rescale_timeout = FULL_RESCALE_TIMEOUT
        else:
            rescale_timeout = self._rescale_timeout

        bundle_count = len(allocation) + len(dead_workers)
        bundles = [
            copy.deepcopy(self._worker_resources)
            for _ in range(bundle_count)
        ]
        for bundle in bundles:
            # NOTE(review): +0.1 CPU per bundle — presumably to force the
            # autoscaler to provision fresh nodes; confirm against callers.
            bundle["CPU"] += 0.1
        sdk.request_resources(bundles=bundles)

        logging.info(f"Waiting for up to {rescale_timeout} seconds for "
                     "nodes to be ready")
        elapsed = 0.0
        while (elapsed < rescale_timeout
               and not self._cluster_ready(allocation)[0]
               and not self._force_immediate_allocation.is_set()):
            await asyncio.sleep(1.0)
            elapsed += 1.0

        ready, nodes = self._cluster_ready(allocation)
        logging.info(f"Found {nodes} available nodes")
        if ready:
            return allocation
        # Not enough nodes: keep non-virtual entries first (stable sort by
        # the virtual flag reproduces the original two-list concatenation),
        # then truncate to what is available.
        reordered = sorted(allocation,
                           key=lambda node: "adaptdl_virtual" in node)
        return reordered[:nodes]
示例#4
0
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    """Poll the monitor until load metrics match *expected_resource_usage*.

    Requests 42 CPUs and a placement group, verifies both demands are
    propagated to the load metrics, strips volatile resources (memory,
    object store memory, per-node ``node:...`` keys) from the reported
    usage, and compares the remainder against the expectation. Also checks
    that a resize event was emitted and that every volatile key was seen
    at least once on both usage entries.

    Args:
        monitor: Autoscaler monitor under test (project type).
        expected_resource_usage: Pair of usage dicts to match exactly; if
            None, wait until both sanitized entries are non-empty instead.
        timeout: Approximate seconds to wait before giving up.

    Returns:
        The sanitized resource-usage pair from the final iteration.

    Raises:
        ValueError: If the expectation is not met within *timeout* seconds.
    """
    request_resources(num_cpus=42)

    # Add placement groups.
    pg_demands = [{"GPU": 2}, {"extra_resource": 2}]
    strategy = "STRICT_PACK"
    pg = placement_group(pg_demands, strategy=strategy)
    pg.ready()
    time.sleep(2)  # Wait for placement groups to propagate.

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    # Records which volatile keys were present (and removed) per entry.
    visited_atleast_once = [set(), set()]
    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        pg_response_data = monitor.load_metrics.pending_placement_groups
        assert_correct_pg(pg_response_data, pg_demands, strategy)

        # Strip volatile keys from both usage entries, remembering which
        # ones were observed so we can assert full coverage afterwards.
        for i in (0, 1):
            entry = resource_usage[i]
            for key in ("memory", "object_store_memory"):
                if key in entry:
                    del entry[key]
                    visited_atleast_once[i].add(key)
            for key in [k for k in entry if k.startswith("node:")]:
                del entry[key]
                visited_atleast_once[i].add("node:")

        if expected_resource_usage is None:
            # No explicit expectation: wait until both entries are non-empty.
            if all(resource_usage):
                break
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            break
        else:
            timeout -= 1
            time.sleep(1)

        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    # Every volatile key must have appeared at least once, on both entries.
    assert visited_atleast_once[0] == {
        "memory", "object_store_memory", "node:"
    }
    assert visited_atleast_once[0] == visited_atleast_once[1]

    remove_placement_group(pg)

    return resource_usage