def test_set_topology_fail_with_win_create(self):
    bf.init()
    size = bf.size()
    if size <= 1:
        fname = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(fname))
        return

    tensor = torch.FloatTensor([1])
    window_name = "win_create_test"
    is_created = bf.win_create(tensor, window_name)
    assert is_created, "bf.win_create did not create window object successfully."

    # size == 1 is already handled by the early return above.
    if size == 2:
        expected_topology = nx.from_numpy_array(
            np.array([[0, 0.2], [0.2, 0]]), create_using=nx.DiGraph)
    else:
        expected_topology = RingGraph(size)

    is_set = bf.set_topology(expected_topology)
    assert not is_set, "bf.set_topology did not fail due to win_create."

    topology = bf.load_topology()
    assert isinstance(topology, nx.DiGraph)
    assert IsTopologyEquivalent(topology, ExponentialGraph(size))

    is_freed = bf.win_free()
    assert is_freed, "bf.win_free did not free window object successfully."
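# A minimal ordering sketch (illustration only, not part of the test suite): the
# test above relies on the rule that bf.set_topology succeeds only while no MPI
# window object is alive, so a custom topology must be set before bf.win_create
# and can be changed again only after bf.win_free. The helper name is made up.
def _sketch_topology_before_window():
    bf.init()
    assert bf.set_topology(RingGraph(bf.size()))       # no window yet -> succeeds
    assert bf.win_create(torch.FloatTensor([1]), "sketch_win")
    assert not bf.set_topology(ExponentialGraph(bf.size()))  # rejected: window alive
    assert bf.win_free()                               # now the topology may change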
def test_dynamic_neighbor_allreduce_optimizer(device, atc_style, kwargs):
    error_threshold = kwargs.get("error_threshold", 1.5)
    problem_builder, train_dataloader, test_dataloader, model, optimizer, num_epochs = \
        problem_setup()
    isCUDA = pin_model_to_device(device, model)

    base_dist_optimizer = (bf.DistributedAdaptThenCombineOptimizer if atc_style else
                           bf.DistributedAdaptWithCombineOptimizer)
    optimizer = base_dist_optimizer(
        optimizer, model=model,
        communication_type=bf.CommunicationType.neighbor_allreduce)
    dynamic_topo_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
        bf.load_topology(), bf.rank())

    # Train and test
    train_mse = []
    test_mse = []
    for _ in range(num_epochs):
        dynamic_neighbor_allreduce_train(model, optimizer, train_dataloader,
                                         isCUDA, dynamic_topo_gen)
        train_mse.append(evaluation(model, train_dataloader, isCUDA))
        test_mse.append(evaluation(model, test_dataloader, isCUDA))
    train_mse = np.array(train_mse)
    test_mse = np.array(test_mse)

    # Check if the MSEs in the last three epochs are small enough
    assert (
        train_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Train MSE in the last three epochs doesn't converge."
    assert (
        test_mse[-3:].max() < error_threshold * problem_builder.noise_level**2
    ), "Test MSE in the last three epochs doesn't converge."
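# A hedged sketch of the per-iteration topology refresh that the test above
# depends on. The real dynamic_neighbor_allreduce_train helper is defined
# elsewhere in this module; the optimizer attribute names self_weight,
# src_weights, and dst_weights below are assumptions made for illustration.
def _sketch_dynamic_train_step(model, optimizer, data, target, dynamic_topo_gen):
    # Draw a fresh send/recv pair for this iteration and weight the received
    # values (plus the local one) uniformly.
    send_neighbors, recv_neighbors = next(dynamic_topo_gen)
    optimizer.dst_weights = {r: 1.0 for r in send_neighbors}   # assumed attribute
    optimizer.src_weights = {
        r: 1 / (len(recv_neighbors) + 1) for r in recv_neighbors}
    optimizer.self_weight = 1 / (len(recv_neighbors) + 1)      # assumed attribute
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(data), target)
    loss.backward()
    optimizer.step()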
def test_set_and_load_topology(self):
    bf.init()
    size = bf.size()
    if size == 4:
        expected_topology = nx.DiGraph(np.array(
            [[1 / 3., 1 / 3., 1 / 3., 0.],
             [0., 1 / 3., 1 / 3., 1 / 3.],
             [1 / 3., 0., 1 / 3., 1 / 3.],
             [1 / 3., 1 / 3., 0., 1 / 3.]]))
    elif size == 1:
        expected_topology = nx.DiGraph(np.array([[1.0]]))
    else:
        expected_topology = ExponentialGraph(size)
    topology = bf.load_topology()
    assert isinstance(topology, nx.DiGraph)
    assert IsTopologyEquivalent(expected_topology, topology)
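# A minimal round-trip sketch (illustration only): whatever digraph is passed to
# bf.set_topology should come back, up to equivalence, from bf.load_topology().
def _sketch_set_then_load_roundtrip():
    bf.init()
    ring = RingGraph(bf.size())
    if bf.set_topology(ring):
        assert IsTopologyEquivalent(ring, bf.load_topology())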
def test_infer_source_from_destination_ranks(topo_func):
    bf.init()
    size = bf.size()
    bf.set_topology(topo_func(size))
    topo = bf.load_topology()
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    # Make the W into average rule.
    expected_W = (nx.to_numpy_array(topo) > 0).astype(float)
    expected_W /= expected_W.sum(axis=0)
    src_ranks, W = InferSourceFromDestinationRanks(
        dst_ranks=out_neighbors, construct_adjacency_matrix=True)
    assert sorted(src_ranks) == in_neighbors
    np.testing.assert_allclose(W, expected_W)
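# A tiny worked example of the "average rule" normalization used above, on a
# size-3 ring: each column j of W spreads a total weight of 1 uniformly over the
# incoming edges of node j, so W comes out column-stochastic.
def _sketch_average_rule_matrix():
    W = (nx.to_numpy_array(RingGraph(3)) > 0).astype(float)
    W /= W.sum(axis=0)            # normalize every column to sum to 1
    np.testing.assert_allclose(W.sum(axis=0), np.ones(3))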
def diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    topology = bf.load_topology()
    self_weight, neighbor_weights = topology_util.GetRecvWeights(
        topology, bf.rank())

    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    phi = w.clone()
    mse = []

    for i in range(maxite):
        # calculate local gradient via pytorch autograd
        loss_step(X, y, w, tensor_name='neighbor.allreduce.local_variable',
                  loss=loss, rho=rho)

        # diffusion
        with torch.no_grad():
            phi = w - alpha * w.grad.data
            w.data = bf.neighbor_allreduce(phi, self_weight=self_weight,
                                           src_weights=neighbor_weights,
                                           name='local variable')
            w.grad.data.zero_()

        # record convergence
        if bf.rank() == 0:
            mse.append(torch.norm(w.data - w_opt.data, p=2))

    return w, mse
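# Written out per node i with the combine weights a_{ij} returned by
# GetRecvWeights, the loop above is the standard adapt-then-combine diffusion
# recursion:
#
#     phi_i^k   = w_i^k - alpha * grad J_i(w_i^k)                  # local adapt
#     w_i^{k+1} = a_{ii} phi_i^k + sum_{j in N_i} a_{ij} phi_j^k   # combine
#
# where the combine step is realized by the bf.neighbor_allreduce call.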
x_bar = bf.allreduce(x, average=True)
mse = [torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2)]

if not args.asynchronous_mode:
    self_weight = None
    neighbor_weights = None
    send_neighbors = None

    if args.enable_dynamic_topology:
        if args.virtual_topology == "InnerOuterExpo2":
            dynamic_neighbor_allreduce_gen = \
                topology_util.GetInnerOuterExpo2DynamicSendRecvRanks(
                    bf.size(), local_size=bf.local_size(), self_rank=bf.rank())
        else:
            dynamic_neighbor_allreduce_gen = \
                topology_util.GetDynamicOnePeerSendRecvRanks(
                    bf.load_topology(), bf.rank())

    for ite in range(args.max_iters):
        if args.enable_dynamic_topology:
            send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1) for r in recv_neighbors}
            self_weight = 1 / (len(recv_neighbors) + 1)

        x = bf.neighbor_allreduce(x, name='x',
                                  self_weight=self_weight,
                                  neighbor_weights=neighbor_weights,
def exact_diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, use_Abar=True,
                    **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    topology = bf.load_topology()
    self_weight, neighbor_weights = topology_util.GetRecvWeights(
        topology, bf.rank())

    if bf.rank() == 0:
        print('self weights with A: {}\n'.format(self_weight))
        print('neighbor weights with A:\n')
        for k, v in neighbor_weights.items():
            print(k, v)

    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    phi, psi, psi_prev = w.clone(), w.clone(), w.clone()
    mse = []

    # construct A_bar
    if use_Abar:
        self_weight = (self_weight + 1) / 2
        for k, v in neighbor_weights.items():
            neighbor_weights[k] = v / 2

    for i in range(maxite):
        # calculate local gradient via pytorch autograd
        loss_step(X, y, w, tensor_name='neighbor.allreduce.local_variable',
                  loss=loss, rho=rho)

        # exact diffusion
        psi = w - alpha * w.grad.data
        phi = psi + w.data - psi_prev
        w.data = bf.neighbor_allreduce(phi, self_weight, neighbor_weights,
                                       name='local variable')
        psi_prev = psi.clone()
        w.grad.data.zero_()

        # record convergence
        if bf.rank() == 0:
            mse.append(torch.norm(w.data - w_opt.data, p=2))

    return w, mse
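# For reference, the loop above is the exact diffusion recursion: relative to
# plain diffusion it inserts the correction term w^k - psi^{k-1} before combining,
#
#     psi_i^k   = w_i^k - alpha * grad J_i(w_i^k)       # adapt
#     phi_i^k   = psi_i^k + w_i^k - psi_i^{k-1}         # correction
#     w_i^{k+1} = sum_j abar_{ij} phi_j^k               # combine
#
# and, when use_Abar is set, it combines with Abar = (A + I) / 2, which is what
# the halving of self_weight and neighbor_weights above constructs.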