Example #1
0
def test_load_balancing():
    """Check that load_balance_peers produces optimal, valid partitions."""
    # (vector_size, throughputs, reference partition) — result must cost no more than the reference
    optimality_cases = [
        (60, [0.25, 0.25, 0.25, 0.25], [15, 15, 15, 15]),
        (1024, [0.3, 0.5, 0.9], [0, 255, 769]),
        (60, [0.44, 0.33, 0.22], [42, 18, 0]),
        (60, [0.55, 0.44, 0.40], [35, 16, 9]),
        (1024 * 1024, [0.3, 0.5, 0.9, 0.6], [0, 169327, 602629, 276620]),
        (1024 * 1024, [0.0, 0.5, 0.0, 0.6], [0, 428963, 0, 619613]),
    ]
    for size, speeds, reference in optimality_cases:
        check_optimality(size, np.array(speeds), reference)

    # with min_size, peers below the threshold are dropped (assigned zero)
    min_size_cases = [
        (60, [0.55, 0.44, 0.40], (41, 19, 0)),
        (60, [0.32, 0.55, 0.44], (0, 40, 20)),
        (2, [0.55, 0.20, 0.44], (1, 0, 1)),
        (1, [0.55, 0.20, 0.44], (1, 0, 0)),
    ]
    for size, speeds, expected in min_size_cases:
        assert load_balance_peers(size, np.array(speeds), min_size=10) == expected

    # None throughput means "split evenly among the None entries"
    assert load_balance_peers(100, (None, None)) == (50, 50)
    assert load_balance_peers(100, (None, None, None, None, None)) == (20, 20, 20, 20, 20)
    assert load_balance_peers(100, (0, 0, 0, None, None)) == (0, 0, 0, 50, 50)

    # all-zero throughputs cannot absorb a nonzero vector
    with pytest.raises(AssertionError):
        load_balance_peers(100, (0, 0, 0))

    # fuzz: any random configuration must yield a complete, nonnegative assignment
    for _ in range(10):
        total = np.random.randint(1, 1024 ** 3)
        peers = np.random.randint(1, 256)
        magnitude = 1e-9 + np.random.rand() * 1e5
        speeds = np.random.rand(peers) * magnitude + 1e-6
        threshold = np.random.choice([0, np.random.randint(0, total // 10)])
        parts = load_balance_peers(total, speeds, threshold)
        assert np.sum(parts) == total
        assert np.min(parts) >= 0
Example #2
0
def test_partitioning():
    """Check that splitting tensors into parts and restoring them round-trips exactly."""

    def _random_tensor():
        # random rank 0..4, each dim 0..16, filled by a randomly chosen factory
        rank = random.randint(0, 4)
        dims = torch.Size([random.randint(0, 16) for _ in range(rank)])
        factory = random.choice([torch.rand, torch.randn, torch.zeros, torch.ones])
        return factory(dims)

    for _ in range(100):
        tensors = [_random_tensor() for _ in range(random.randint(1, 5))]

        total_size = sum(t.numel() for t in tensors)
        if total_size == 0:
            # nothing to partition; skip degenerate draw
            continue
        num_chunks = random.randint(1, min(100, total_size))
        part_sizes = load_balance_peers(total_size, [None] * num_chunks)
        pieces = split_into_parts(tensors, part_sizes)
        assert len(pieces) == num_chunks

        original_shapes = [t.shape for t in tensors]
        reassembled = restore_from_parts(pieces, original_shapes)
        assert len(reassembled) == len(tensors)
        for restored_tensor, source_tensor in zip(reassembled, tensors):
            assert restored_tensor.shape == source_tensor.shape
            assert torch.allclose(restored_tensor, source_tensor)
Example #3
0
    async def leader_assemble_group(self) -> AllReduceRunner:
        """ Form up all current followers into a group and prepare to _run_allreduce.

        Builds the ordered peer list (followers + self, shuffled), collects per-peer
        throughputs and gather payloads, computes part sizes, constructs the
        AllReduceRunner, and resolves self.assembled_group with it.
        """
        # Caller must hold both locks: this method reads/publishes group-assembly state.
        assert self.lock_looking_for_group.locked() and self.lock_request_join_group.locked()
        assert not self.assembled_group.done()
        # Fresh group identifier for this allreduce round.
        group_id = DHTID.generate().to_bytes()
        # Peer order = followers + leader, then shuffled so ordering is randomized.
        ordered_group_endpoints = list(self.current_followers)
        ordered_group_endpoints.append(self.endpoint)
        random.shuffle(ordered_group_endpoints)

        # Collect throughput and gathered data in the same order as the endpoints.
        throughputs, gathered = [], []
        for endpoint in ordered_group_endpoints:
            if endpoint == self.endpoint:
                # The leader contributes its own throughput and gather payload directly.
                throughputs.append(self.throughput)
                gathered.append(self.data_for_gather)
            else:
                follower_info = self.current_followers[endpoint]
                # Negative throughput is mapped to None (presumably "unknown" —
                # load_balance_peers handles None entries; confirm in its docs).
                throughputs.append(follower_info.throughput if follower_info.throughput >= 0 else None)
                # Falsy (e.g. empty) gather payloads are normalized to None.
                gathered.append(follower_info.gather if follower_info.gather else None)

        # Split the total vector among peers according to throughput.
        part_sizes = load_balance_peers(self.total_size, throughputs, self.min_vector_size)
        # Random 32-bit signed seed shared with the group (used for the next group key).
        group_key_seed = random.randint(- 2 ** 31, 2 ** 31 - 1)

        logger.debug(f"{self.endpoint} - leader started allreduce for {len(ordered_group_endpoints)} peers.")
        allreduce_group = AllReduceRunner(group_id=group_id, tensors=self.averaged_tensors, endpoint=self.endpoint,
                                          ordered_group_endpoints=ordered_group_endpoints, part_sizes=part_sizes,
                                          gathered=gathered, group_key_seed=group_key_seed, **self.allreduce_kwargs)
        await self.group_key_manager.update_key_on_group_assembled(allreduce_group, is_leader=True)
        # Resolve the future so coroutines awaiting assembled_group receive the runner.
        self.assembled_group.set_result(allreduce_group)
        return allreduce_group
Example #4
0
def check_optimality(vector_size, throughputs, ref_partitions):
    """Assert that load_balance_peers costs no more than a reference partitioning."""
    computed_partitions = list(load_balance_peers(vector_size, throughputs))
    computed_cost = get_cost(vector_size, computed_partitions, throughputs)
    reference_cost = get_cost(vector_size, ref_partitions, throughputs)
    assert computed_cost <= reference_cost