Example #1
    def test_win_update_with_given_weights(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create did not create the window object successfully."

            # Test simple average rule.
            weight = 1.0 / (len(bf.in_neighbor_ranks()) + 1)
            sync_result = bf.win_update(
                window_name,
                self_weight=weight,
                neighbor_weights={x: weight
                                  for x in bf.in_neighbor_ranks()})
            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update (weighted) produces wrong shape tensor.")
            assert (sync_result.data - rank).abs().max() < EPSILON, (
                "bf.win_update (weighted) produces wrong tensor value " +
                "[{0}-{1}]!={2} at rank {2}.".format(sync_result.min(),
                                                     sync_result.max(), rank))
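A quick arithmetic check of the rule this test exercises: self_weight and every neighbor weight equal 1/(k+1) for k in-neighbors, so the weights form a convex combination and averaging identical buffers returns the buffer unchanged. A minimal sketch (the neighbor count k is an assumption, not taken from the test):

k = 3                    # assumed number of in-neighbors
weight = 1.0 / (k + 1)   # shared by self and all neighbors
# The weights sum to 1, so a constant buffer stays constant after the update.
assert abs(weight + k * weight - 1.0) < 1e-12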
Example #2
    def test_timeline_push_sum(self):
        # Use win_accumulate to simulate the push-sum algorithm (sync).
        outdegree = len(bf.out_neighbor_ranks())
        indegree = len(bf.in_neighbor_ranks())
        # We append the push-sum weight p at the end of the data.
        x = torch.Tensor(
            [bf.rank() / (indegree + 1), 1.0 / bf.size() / (indegree + 1)])

        # Remember that we do not create the buffer with zeros.
        bf.win_create(x, name="x_buff")
        x = bf.win_update_then_collect(name="x_buff")

        for _ in range(10):
            bf.win_accumulate(x,
                              name="x_buff",
                              dst_weights={
                                  rank: 1.0 / (outdegree + 1)
                                  for rank in bf.out_neighbor_ranks()
                              },
                              require_mutex=True)
            x.div_(1 + outdegree)
            x = bf.win_update_then_collect(name="x_buff")

        bf.barrier()
        # Do not forget to sync at the end!
        x = bf.win_update_then_collect(name="x_buff")

        file_name = f"{self.temp_file}{bf.rank()}.json"
        with open(file_name, 'r') as tf:
            timeline_text = tf.read()
            assert 'MPI_WIN_ACCUMULATE' in timeline_text, timeline_text
            assert 'ENQUEUE_WIN_ACCUMULATE' in timeline_text, timeline_text

        bf.win_free()
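A hedged note on the pair stored in the buffer above: push-sum transports (value, weight), and the running estimate is their ratio, which should approach the average of the initial values, here the mean rank. A sketch of the final check one could add (assuming all ranks participate):

estimate = x[0] / x[1]          # value / weight
expected = (bf.size() - 1) / 2  # average of ranks 0..size-1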
Example #3
    def test_in_out_neighbors_expo2(self):
        bf.init()
        rank = bf.rank()
        size = bf.size()
        assert bf.set_topology(ExponentialGraph(size))
        in_neighbors = bf.in_neighbor_ranks()
        out_neighbors = bf.out_neighbor_ranks()

        degree = int(np.ceil(np.log2(size)))
        expected_in_neighbors = sorted([(rank - 2**i) % size
                                        for i in range(degree)])
        expected_out_neighbors = sorted([(rank + 2**i) % size
                                         for i in range(degree)])
        assert sorted(in_neighbors) == expected_in_neighbors
        assert sorted(out_neighbors) == expected_out_neighbors
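A worked instance of the exponential graph, assuming size = 8 so that degree = log2(8) = 3: rank 0 sends to ranks 1, 2, 4 and receives from ranks 4, 6, 7.

size, rank = 8, 0
degree = int(np.ceil(np.log2(size)))
print(sorted((rank + 2**i) % size for i in range(degree)))  # [1, 2, 4]
print(sorted((rank - 2**i) % size for i in range(degree)))  # [4, 6, 7]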
Example #4
def test_infer_source_from_destination_ranks(topo_func):
    bf.init()
    size = bf.size()
    bf.set_topology(topo_func(size))
    topo = bf.load_topology()
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    # Normalize W into the averaging (column-stochastic) rule.
    expected_W = (nx.to_numpy_array(topo) > 0).astype(float)
    expected_W /= expected_W.sum(axis=0)

    src_ranks, W = InferSourceFromDestinationRanks(
        dst_ranks=out_neighbors, construct_adjacency_matrix=True)
    assert sorted(src_ranks) == sorted(in_neighbors)
    np.testing.assert_allclose(W, expected_W)
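A minimal numpy sketch of the normalization used for expected_W, with an assumed 3-node directed ring plus self-loops: dividing by the column sums makes W column-stochastic, which is the averaging rule the assertion verifies.

A = np.array([[1., 0., 1.],
              [1., 1., 0.],
              [0., 1., 1.]])   # assumed adjacency with self-loops
W = A / A.sum(axis=0)          # same normalization as expected_W above
assert np.allclose(W.sum(axis=0), 1.0)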
Example #5
    def test_in_out_neighbors_biring(self):
        bf.init()
        rank = bf.rank()
        size = bf.size()
        assert bf.set_topology(RingGraph(size))
        in_neighbors = bf.in_neighbor_ranks()
        out_neighbors = bf.out_neighbor_ranks()

        expected_in_neighbors = sorted(
            set(map(lambda x: x % size, [rank - 1, rank + 1])))
        expected_out_neighbors = sorted(
            set(map(lambda x: x % size, [rank - 1, rank + 1])))

        if size <= 1:
            expected_in_neighbors = []
            expected_out_neighbors = []

        assert sorted(in_neighbors) == expected_in_neighbors
        assert sorted(out_neighbors) == expected_out_neighbors
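The set() in the expected lists guards an edge case worth making explicit: assuming size = 2, rank 0's left and right ring neighbors are both rank 1, so the expected list collapses to a single entry.

size, rank = 2, 0
print(sorted(set((rank + d) % size for d in (-1, 1))))  # [1]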
Example #6
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1)
                for r in recv_neighbors
            }
            self_weight = 1 / (len(recv_neighbors) + 1)

        x = bf.neighbor_allreduce(x,
                                  name='x',
                                  self_weight=self_weight,
                                  neighbor_weights=neighbor_weights,
                                  send_neighbors=send_neighbors,
                                  enable_topo_check=False)
        mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))
else:
    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    if not bf.nccl_built():  # NCCL does not support the associated P yet.
        bf.turn_on_win_ops_with_associated_p()
        bf.win_create(x, name="x", zero_init=True)
        for i in range(args.max_iters):
            if args.enable_dynamic_topology:
                num_out_neighbors = len(bf.out_neighbor_ranks())
                sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors]
                dst_weights = {sent_neighbor: 0.5}
                self_weight = 0.5
            else:
                dst_weights = {
                    rank: 1.0 / (outdegree + 1)
                    for rank in bf.out_neighbor_ranks()
                }
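A small illustration of the dynamic-topology branch above (the neighbor list is assumed, not taken from the script): each iteration i picks exactly one out-neighbor round-robin and splits the weight evenly between it and self.

out_neighbors = [1, 2, 4]  # assumed out-neighbor ranks
for i in range(5):
    target = out_neighbors[i % len(out_neighbors)]
    print(i, {target: 0.5}, 0.5)  # dst_weights, self_weight for iteration i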
Example #7
def push_diging(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):

    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    # We let w = col{u, y, v}, i.e., u, y, v = w[:n], w[n:2*n], w[2*n].
    # Instead of three directed_neighbor_allreduce operations for u, y,
    # and v respectively, we exploit one directed_neighbor_allreduce for
    # the combo vector w. This guarantees u, y, and v are transmitted
    # simultaneously and avoids any mismatch between them. Experiments
    # show directed_neighbor_allreduce(w) is crucial for the convergence
    # of push_diging.
    n = X.shape[1]  # assumed feature dimension; x and the gradient are n x 1
    w = torch.zeros(2 * n + 1, 1).to(torch.double)
    x = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)

    grad = x.grad.data.clone()
    w[n:2 * n] = grad
    x.grad.data.zero_()

    w[-1] = 1.0
    grad_prev = w[n:2 * n].clone()

    bf.win_create(w, name="w_buff", zero_init=True)

    mse = []
    for _ in range(maxite):
        bf.barrier()

        w[:n] = w[:n] - alpha * w[n:2 * n]
        bf.win_accumulate(w,
                          name="w_buff",
                          dst_weights={
                              rank: 1.0 / (outdegree * 2)
                              for rank in bf.out_neighbor_ranks()
                          },
                          require_mutex=True)
        w.div_(2)
        bf.barrier()

        w = bf.win_update_then_collect(name="w_buff")

        x.data = w[:n] / w[-1]
        loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)
        grad = x.grad.data.clone()
        x.grad.data.zero_()

        w[n:2 * n] += grad - grad_prev
        grad_prev = grad
        if bf.rank() == 0:
            mse.append(torch.norm(x.data - w_opt, p=2))

    bf.barrier()
    w = bf.win_update_then_collect(name="w_buff")
    x.data = w[:n] / w[-1]

    return x, mse
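A toy check of the push-sum correction push_diging relies on, with an assumed n = 2: w stacks the iterate u, the gradient tracker y, and the scalar weight v, and the de-biased iterate is u / v.

n = 2
w = torch.tensor([[0.4], [0.6], [0.1], [0.2], [0.5]], dtype=torch.double)
u, y, v = w[:n], w[n:2 * n], w[-1]
x_debiased = u / v  # the ratio removes the scaling from win_accumulate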