def test_win_update_with_given_weights(self):
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        fname = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(fname))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
        tensor = self.cast_and_place(tensor, dtype)
        window_name = "win_create_{}_{}".format(dim, dtype)
        is_created = bf.win_create(tensor, window_name)
        assert is_created, "bf.win_create did not create the window object successfully."

        # Test the simple average rule.
        weight = 1.0 / (len(bf.in_neighbor_ranks()) + 1)
        sync_result = bf.win_update(
            window_name,
            self_weight=weight,
            neighbor_weights={x: weight for x in bf.in_neighbor_ranks()})
        assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
            "bf.win_update (weighted) produces a wrong-shape tensor.")
        assert (sync_result.data - rank).abs().max() < EPSILON, (
            "bf.win_update (weighted) produces wrong tensor values "
            "[{0}-{1}]!={2} at rank {2}.".format(
                sync_result.min(), sync_result.max(), rank))
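# Hedged arithmetic note (not part of the test): the expected value being
# exactly `rank` assumes, as the assertion above implies, that win_create
# initializes every in-neighbor buffer with a copy of the local tensor, so
# the uniform-weight update averages (indegree + 1) identical copies:
#     w * rank + indegree * (w * rank) = rank,  with w = 1 / (indegree + 1).
def _expected_weighted_update(local_value, indegree):
    """Illustrative arithmetic only; mirrors the simple average rule above."""
    w = 1.0 / (indegree + 1)
    return w * local_value + indegree * (w * local_value)

assert _expected_weighted_update(3.0, 3) == 3.0  # exact for w = 0.25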
def test_timeline_push_sum(self):
    # Use win_accumulate to simulate the push-sum algorithm (sync).
    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    # We append the push-sum weight p at the end of the data.
    x = torch.Tensor(
        [bf.rank() / (indegree + 1), 1.0 / bf.size() / (indegree + 1)])
    # Remember: do not create the buffer with 0.
    bf.win_create(x, name="x_buff")
    x = bf.win_update_then_collect(name="x_buff")

    for _ in range(10):
        bf.win_accumulate(x, name="x_buff",
                          dst_weights={
                              rank: 1.0 / (outdegree + 1)
                              for rank in bf.out_neighbor_ranks()
                          },
                          require_mutex=True)
        x.div_(1 + outdegree)
        x = bf.win_update_then_collect(name="x_buff")

    bf.barrier()
    # Do not forget to sync at the end!
    x = bf.win_update_then_collect(name="x_buff")

    file_name = f"{self.temp_file}{bf.rank()}.json"
    with open(file_name, 'r') as tf:
        timeline_text = tf.read()
        assert 'MPI_WIN_ACCUMULATE' in timeline_text, timeline_text
        assert 'ENQUEUE_WIN_ACCUMULATE' in timeline_text, timeline_text
    bf.win_free()
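# Hedged, self-contained sketch (not part of the test suite): the push-sum
# recursion that the test above simulates with win_accumulate. Each node
# keeps a pair (x_i, p_i) and at every step splits both entries evenly
# between itself and its out-neighbor; the mixing matrix is column-stochastic,
# so sum(x) and sum(p) are preserved and the ratio x_i / p_i converges to the
# global average of the initial x values on a strongly connected digraph.
import numpy as np

def push_sum_ring(values, steps=50):
    """Simulate push-sum on a directed ring where node i pushes to (i + 1) % n."""
    x = np.asarray(values, dtype=float)
    p = np.ones_like(x)
    for _ in range(steps):
        # Keep half, push half to the next node (column-stochastic mixing).
        x = 0.5 * x + 0.5 * np.roll(x, 1)
        p = 0.5 * p + 0.5 * np.roll(p, 1)
    return x / p

print(push_sum_ring([0.0, 1.0, 2.0, 3.0]))  # every entry is close to 1.5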
def test_in_out_neighbors_expo2(self):
    bf.init()
    rank = bf.rank()
    size = bf.size()
    assert bf.set_topology(ExponentialGraph(size))
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    degree = int(np.ceil(np.log2(size)))
    expected_in_neighbors = sorted([(rank - 2**i) % size
                                    for i in range(degree)])
    expected_out_neighbors = sorted([(rank + 2**i) % size
                                     for i in range(degree)])
    assert sorted(in_neighbors) == expected_in_neighbors
    assert sorted(out_neighbors) == expected_out_neighbors
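# Hedged illustration (not part of the test file): in the static exponential
# graph, rank r sends to ranks (r + 2**i) % size and receives from ranks
# (r - 2**i) % size for i = 0, ..., ceil(log2(size)) - 1. For size = 8 and
# rank = 0 this gives out-neighbors [1, 2, 4] and in-neighbors [4, 6, 7].
import numpy as np

def expo2_neighbors(rank, size):
    degree = int(np.ceil(np.log2(size)))
    in_nb = sorted((rank - 2**i) % size for i in range(degree))
    out_nb = sorted((rank + 2**i) % size for i in range(degree))
    return in_nb, out_nb

assert expo2_neighbors(0, 8) == ([4, 6, 7], [1, 2, 4])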
def test_infer_source_from_destination_ranks(topo_func):
    bf.init()
    size = bf.size()
    bf.set_topology(topo_func(size))
    topo = bf.load_topology()
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    # Normalize W column-wise into the average rule matrix.
    expected_W = (nx.to_numpy_array(topo) > 0).astype(float)
    expected_W /= expected_W.sum(axis=0)
    src_ranks, W = InferSourceFromDestinationRanks(
        dst_ranks=out_neighbors, construct_adjacency_matrix=True)
    assert sorted(src_ranks) == in_neighbors
    np.testing.assert_allclose(W, expected_W)
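# Hedged sketch (helper name is illustrative, not Bluefog API): how the
# "average rule" matrix above is built. The boolean adjacency pattern
# (self-loops included, as in the Bluefog topologies) is normalized
# column-wise, so every column sums to one and W is column-stochastic,
# which is the weight convention push-style operations rely on.
import numpy as np

def average_rule_matrix(adjacency):
    W = (np.asarray(adjacency) > 0).astype(float)
    return W / W.sum(axis=0)  # column-stochastic normalization

demo = [[1, 0, 0, 1],
        [1, 1, 0, 0],
        [0, 1, 1, 0],
        [0, 0, 1, 1]]  # directed ring with self-loops
assert np.allclose(average_rule_matrix(demo).sum(axis=0), 1.0)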
def test_in_out_neighbors_biring(self):
    bf.init()
    rank = bf.rank()
    size = bf.size()
    assert bf.set_topology(RingGraph(size))
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    expected_in_neighbors = sorted(
        set(map(lambda x: x % size, [rank - 1, rank + 1])))
    expected_out_neighbors = sorted(
        set(map(lambda x: x % size, [rank - 1, rank + 1])))
    if size <= 1:
        expected_in_neighbors = []
        expected_out_neighbors = []

    assert sorted(in_neighbors) == expected_in_neighbors
    assert sorted(out_neighbors) == expected_out_neighbors
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1) for r in recv_neighbors}
            self_weight = 1 / (len(recv_neighbors) + 1)
            x = bf.neighbor_allreduce(x, name='x', self_weight=self_weight,
                                      neighbor_weights=neighbor_weights,
                                      send_neighbors=send_neighbors,
                                      enable_topo_check=False)
        mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))
else:
    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    if not bf.nccl_built():  # NCCL does not support the associated P yet.
        bf.turn_on_win_ops_with_associated_p()
    bf.win_create(x, name="x", zero_init=True)
    for i in range(args.max_iters):
        if args.enable_dynamic_topology:
            num_out_neighbors = len(bf.out_neighbor_ranks())
            sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors]
            dst_weights = {sent_neighbor: 0.5}
            self_weight = 0.5
        else:
            dst_weights = {rank: 1.0 / (outdegree + 1)
                           for rank in bf.out_neighbor_ranks()}
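# Hedged sanity sketch (not part of the example): both branches above define
# a mass-preserving update, i.e. the weight kept locally plus the weights
# pushed to out-neighbors sum to 1, which keeps the iterates averaging
# rather than shrinking or blowing up over time.
def _weights_preserve_mass(self_weight, dst_weights, tol=1e-12):
    return abs(self_weight + sum(dst_weights.values()) - 1.0) < tol

# Dynamic branch: keep half, push half to a single neighbor (rank 7 here
# is a made-up example value).
assert _weights_preserve_mass(0.5, {7: 0.5})
# Static branch: uniform split over self plus `outdegree` neighbors.
assert _weights_preserve_mass(1.0 / 4, {r: 1.0 / 4 for r in [1, 2, 5]})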
def push_diging(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports'
            ' linear_regression and logistic_regression')

    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    # We let w = col{u, y, v}, i.e., u, y, v = w[:n], w[n:2*n], w[2*n].
    # (`n`, the feature dimension, and loss_step are defined at module level.)
    # Instead of three directed_neighbor_allreduce operations for u, y,
    # and v respectively, we exploit one directed_neighbor_allreduce for
    # the combined vector w. This guarantees that u, y, and v are transmitted
    # simultaneously and avoids any mismatch between them. Experiments show
    # that directed_neighbor_allreduce(w) is crucial for the convergence of
    # push_diging.
    w = torch.zeros(2 * n + 1, 1).to(torch.double)
    x = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)
    grad = x.grad.data.clone()
    w[n:2 * n] = grad
    x.grad.data.zero_()

    w[-1] = 1.0
    grad_prev = w[n:2 * n].clone()

    bf.win_create(w, name="w_buff", zero_init=True)

    mse = []
    for _ in range(maxite):
        bf.barrier()

        w[:n] = w[:n] - alpha * w[n:2 * n]
        bf.win_accumulate(w, name="w_buff",
                          dst_weights={
                              rank: 1.0 / (outdegree * 2)
                              for rank in bf.out_neighbor_ranks()
                          },
                          require_mutex=True)
        w.div_(2)
        bf.barrier()

        w = bf.win_update_then_collect(name="w_buff")
        x.data = w[:n] / w[-1]
        loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)
        grad = x.grad.data.clone()
        x.grad.data.zero_()

        w[n:2 * n] += grad - grad_prev
        grad_prev = grad
        if bf.rank() == 0:
            mse.append(torch.norm(x.data - w_opt, p=2))

    bf.barrier()
    w = bf.win_update_then_collect(name="w_buff")
    x.data = w[:n] / w[-1]
    return x, mse
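# Hedged usage sketch (data shapes, seeds, and hyperparameters are
# assumptions, not from the example): push_diging reads the module-level
# feature dimension `n` and the module's loss_step helper, so it is meant
# to be driven from this file, e.g. with synthetic per-rank linear
# regression data:
if __name__ == "__main__":
    bf.init()
    n = 20
    torch.manual_seed(0)                       # identical w_opt on every rank
    w_opt = torch.randn(n, 1, dtype=torch.double)
    torch.manual_seed(bf.rank())               # different local data per rank
    X = torch.randn(500, n, dtype=torch.double)
    y = X.mm(w_opt) + 0.1 * torch.randn(500, 1, dtype=torch.double)
    x_hat, mse = push_diging(X, y, w_opt, loss='linear_regression',
                             maxite=500, alpha=1e-2)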