def test_win_mutex_full(self):
    """Check how long acquiring a window mutex takes while rank 0 holds it.

    On a fully connected topology, rank 0 takes the mutex (with
    ``for_self=True``), passes a barrier and sleeps 1.01 s. Every other rank
    passes the same barrier and then times its own ``win_mutex`` acquisition,
    which is expected to take between 1 and 2 seconds.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 2:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn(
            "Skip {} because it only supports test over at least 3 nodes".format(
                test_name))
        return
    bf.set_topology(topology_util.FullyConnectedGraph(world_size))

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    for tensor_type in tensor_types:
        payload = self.cast_and_place(
            torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(my_rank), tensor_type)
        win_name = "win_mutex_full_{}".format(tensor_type)
        bf.win_create(payload, win_name)

        if my_rank != 0:
            # Waiters: start timing only once rank 0 already owns the mutex.
            bf.barrier()
            started = time.time()
            with bf.win_mutex(win_name):
                time.sleep(0.001)
            elapsed = time.time() - started
            assert elapsed > 1, \
                "The mutex acquire time should be longer than 1 second"
            assert elapsed < 2, \
                "The mutex acquire time should be shorter than 2 second"
        else:
            # Holder: keep the mutex for slightly more than one second.
            with bf.win_mutex(win_name, for_self=True):
                bf.barrier()
                time.sleep(1.01)
def test_timeline_push_sum(self):
    """Run a push-sum style loop and check the timeline file mentions it.

    The loop uses ``win_accumulate`` ten times; afterwards the per-rank
    timeline JSON file (``self.temp_file`` + rank + ``.json``) must contain
    both the ``MPI_WIN_ACCUMULATE`` and ``ENQUEUE_WIN_ACCUMULATE`` markers.
    """
    # Use win_accumulate to simulate the push-sum algorithm (sync).
    n_out = len(bf.out_neighbor_ranks())
    n_in = len(bf.in_neighbor_ranks())

    # The weight p is appended as the last entry of the data vector.
    init_value = bf.rank() / (n_in + 1)
    init_weight = 1.0 / bf.size() / (n_in + 1)
    x = torch.Tensor([init_value, init_weight])

    # Remember we do not create buffer with 0.
    bf.win_create(x, name="x_buff")
    x = bf.win_update_then_collect(name="x_buff")

    for _ in range(10):
        send_weights = {}
        for dst in bf.out_neighbor_ranks():
            send_weights[dst] = 1.0 / (n_out + 1)
        bf.win_accumulate(x,
                          name="x_buff",
                          dst_weights=send_weights,
                          require_mutex=True)
        x.div_(1 + n_out)
        x = bf.win_update_then_collect(name="x_buff")

    bf.barrier()
    # Do not forget to sync at last!
    x = bf.win_update_then_collect(name="x_buff")

    timeline_path = f"{self.temp_file}{bf.rank()}.json"
    with open(timeline_path, 'r') as timeline_file:
        timeline_text = timeline_file.read()
        assert 'MPI_WIN_ACCUMULATE' in timeline_text, timeline_text
        assert 'ENQUEUE_WIN_ACCUMULATE' in timeline_text, timeline_text

    bf.win_free()
def test_win_get(self):
    """Check that win_get followed by win_update yields the expected average.

    The expected value is the mean of this rank and the ranks
    ``(rank - 2**i) % size`` for ``i < ceil(log2(size))``; shape and value of
    the updated tensor are verified for 1-, 2- and 3-dimensional tensors.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 1:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    num_in = int(np.ceil(np.log2(world_size)))
    in_neighbors = []  # in-neighbor ranks
    for hop in range(num_in):
        in_neighbors.append((my_rank - 2**hop) % world_size)
    expected = (my_rank + np.sum(in_neighbors)) / float(num_in + 1)

    for tensor_type in tensor_types:
        for ndim in (1, 2, 3):
            shape = [DIM_SIZE] * ndim
            payload = torch.FloatTensor(*shape).fill_(1).mul_(my_rank)
            payload = self.cast_and_place(payload, tensor_type)
            win_name = "win_get_{}_{}".format(ndim, tensor_type)
            bf.win_create(payload, win_name)
            bf.win_get(win_name)
            bf.barrier()
            received = bf.win_update(win_name, clone=True)

            assert (list(received.shape) == shape), (
                "bf.win_get produce wrong shape tensor.")
            assert (received.data - expected).abs().max() < EPSILON, (
                "bf.win_get produce wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    received.min(), received.max(), expected, my_rank))
def test_get_win_version_with_win_get(self):
    """Check window versions right after create, after win_get and after win_update.

    Expectations encoded by the assertions: all versions are zero after
    ``win_create``, all are one after a single ``win_get``, and all are zero
    again after ``win_update``. Every window created here is freed at the end.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 1:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    num_in = int(np.ceil(np.log2(world_size)))
    in_neighbors = [(my_rank - 2**hop) % world_size for hop in range(num_in)]
    num_neighbors = len(in_neighbors)

    def count_zeros(versions):
        # Number of entries in `versions` that are zero.
        return len(versions) - np.count_nonzero(versions)

    def read_versions(name):
        return list(bf.get_win_version(name).values())

    cases = list(itertools.product(tensor_types, [1, 2, 3]))
    for tensor_type, ndim in cases:
        payload = torch.FloatTensor(*([23] * ndim)).fill_(1).mul_(my_rank)
        payload = self.cast_and_place(payload, tensor_type)
        win_name = "win_version_get_{}_{}".format(ndim, tensor_type)
        bf.win_create(payload, win_name)

        versions_at_start = read_versions(win_name)
        bf.barrier()
        bf.win_get(win_name)
        bf.barrier()
        versions_after_get = read_versions(win_name)
        bf.win_update(win_name, clone=True)
        versions_after_update = read_versions(win_name)

        assert (count_zeros(versions_at_start) == num_neighbors), (
            "version initialization is wrong.")
        assert (count_zeros(versions_after_update) == num_neighbors), (
            "version clear up is wrong.")
        assert (versions_after_get == [1] * num_neighbors), (
            "version after win get is wrong.")

    for tensor_type, ndim in cases:
        win_name = "win_version_get_{}_{}".format(ndim, tensor_type)
        freed = bf.win_free(win_name)
        assert freed, "bf.win_free do not free window object successfully."
def test_asscoicated_with_p(self):
    """Check the associated-p value after one win_accumulate on a ring.

    For each ``send_rank`` in turn, that rank accumulates its tensor to its
    left and right ring neighbors with weight 0.5 each and keeps
    ``self_weight=0.5``. After a barrier and ``win_update_then_collect`` the
    associated p is expected to be 0.5 on the sender, 1.5 on the two
    receiving neighbors and 1.0 everywhere else.

    The test is skipped unless more than 3 processes are running.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 3:
        fname = inspect.currentframe().f_code.co_name
        # The guard skips sizes 1-3, so the message states the real minimum.
        warnings.warn(
            "Skip {} because it only supports test over at least 4 nodes".format(
                fname))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU and not bf.nccl_built():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

    bf.set_topology(topology_util.RingGraph(size))
    bf.turn_on_win_ops_with_associated_p()
    try:
        for dtype, send_rank in itertools.product(dtypes, range(size)):
            tensor = torch.FloatTensor([23]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_{}_{}".format(
                dtype, send_rank)
            bf.win_create(tensor, window_name)

            left_neighbor_rank = (send_rank - 1) % size
            right_neighbor_rank = (send_rank + 1) % size
            if rank == send_rank:
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  self_weight=0.5,
                                  dst_weights={
                                      left_neighbor_rank: 0.5,
                                      right_neighbor_rank: 0.5
                                  })
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)

            if rank == send_rank:
                assert associated_p == 0.5, (
                    "associated_p for sender {} is wrong. Get {}".format(
                        rank, associated_p))
            elif rank in (left_neighbor_rank, right_neighbor_rank):
                # abs() is required: without it any value below 1.5 would
                # satisfy the check.
                assert abs(associated_p - 1.5) < EPSILON, (
                    "associated_p for received neighbor {} is wrong. Get {}".
                    format(rank, associated_p))
            else:
                assert associated_p == 1.0, (
                    "associated_p for untouched node {} is wrong. Get {}".
                    format(rank, associated_p))
    finally:
        # Always switch the mode back off so a failing assertion here does
        # not leave it enabled for whatever runs next.
        bf.turn_off_win_ops_with_associated_p()
def step(self, closure=None):
    """Optionally barrier and synchronize, then run the parent class's step.

    Args:
        closure: Passed through unchanged to the parent ``step``.

    Returns:
        Whatever the parent class's ``step(closure)`` returns.
    """
    if self.force_barrier:
        bf.barrier()

    # some validation here?
    needs_sync = self._should_synchronize
    if needs_sync and self._synchronized:
        # synchronize() already ran and is about to run again.
        warnings.warn(
            "optimizer.step() called without "
            "optimizer.skip_synchronize() context after "
            "optimizer.synchronize(). This can cause training "
            "slowdown. You may want to consider using "
            "optimizer.skip_synchronize() context if you use "
            "optimizer.synchronize() in your code.")
    if needs_sync:
        self.synchronize()

    self._synchronized = False
    parent = super(self.__class__, self)
    return parent.step(closure)
def test_win_put_with_varied_tensor_elements(self):
    """Check win_put + win_update on tensors whose elements are not all equal.

    Each tensor is ``rank`` plus a per-element offset ``arange / 1000``. After
    the update, subtracting that offset must leave the average of this rank
    and the ranks ``(rank - 2**i) % size`` in every element.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 1:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    num_in = int(np.ceil(np.log2(world_size)))
    in_neighbors = [(my_rank - 2**hop) % world_size for hop in range(num_in)]
    expected = (my_rank + np.sum(in_neighbors)) / float(num_in + 1)

    cases = list(itertools.product(tensor_types, [1, 2, 3]))
    for tensor_type, ndim in cases:
        shape = [DIM_SIZE] * ndim
        rank_part = torch.FloatTensor(*shape).fill_(1).mul_(my_rank)
        offset = torch.arange(
            DIM_SIZE**ndim, dtype=torch.float32).view_as(rank_part).div(1000)
        rank_part = self.cast_and_place(rank_part, tensor_type)
        offset = self.cast_and_place(offset, tensor_type)
        payload = rank_part + offset

        win_name = "win_put_{}_{}".format(ndim, tensor_type)
        bf.win_create(payload, win_name)
        bf.win_put(payload, win_name)
        bf.barrier()
        merged = bf.win_update(win_name)

        assert (list(merged.shape) == shape), (
            "bf.win_update after win_put produces wrong shape tensor.")
        assert ((merged - offset).data - expected).abs().max() < EPSILON, (
            "bf.win_update after win_put produces wrong tensor value " +
            "[{}-{}]!={} at rank {}.".format(
                (merged - offset).min(),
                (merged - offset).max(), expected, my_rank))

    time.sleep(0.5)
    for tensor_type, ndim in cases:
        win_name = "win_put_{}_{}".format(ndim, tensor_type)
        freed = bf.win_free(win_name)
        assert freed, "bf.win_free do not free window object successfully."
def test_asscoicated_with_p_random_test(self):
    """Check that associated p tracks the tensor value under random weights.

    With associated-p mode on, ten rounds of ``win_put`` / ``win_update`` /
    ``win_accumulate`` / ``win_update_then_collect`` are run with random
    weights that sum to one. After a final barrier and collect, the
    associated p must equal the first tensor element within ``EPSILON``.
    """
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    # Current, nccl version hasn't supported the associated with p yet.
    if TEST_ON_GPU and not bf.nccl_built():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1]

    bf.turn_on_win_ops_with_associated_p()
    try:
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_random_{}_{}".format(
                dim, dtype)
            bf.win_create(tensor, window_name, zero_init=True)

            for _ in range(10):
                out_ranks = bf.out_neighbor_ranks()
                # One weight per out-neighbor plus a last one for self.
                random_weights = np.random.rand(len(out_ranks) + 1)
                random_weights /= random_weights.sum()
                self_weight = random_weights[-1]
                dst_weights = {
                    r: random_weights[i] for i, r in enumerate(out_ranks)
                }
                bf.win_put(tensor,
                           self_weight=self_weight,
                           dst_weights=dst_weights,
                           name=window_name,
                           require_mutex=True)
                bf.win_update(name=window_name, require_mutex=True)
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  require_mutex=True,
                                  self_weight=self_weight,
                                  dst_weights=dst_weights)
                bf.win_update_then_collect(name=window_name)

            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            # Because the associated p should always be operated on the same
            # way as the tensor, the following assert should hold no matter
            # what order the operations executed in.
            assert abs(associated_p - tensor.data[0]) < EPSILON
    finally:
        # Always switch the mode back off so a failing assertion here does
        # not leave it enabled for whatever runs next.
        bf.turn_off_win_ops_with_associated_p()
def test_win_put_with_given_destination(self):
    """Check win_put with an explicit destination followed by win_update.

    Every rank puts its tensor only to ``(rank + 1) % size`` with weight
    1.23, then ``win_update`` is compared against the expected value
    ``(rank * indegree + 1.23 * ((rank - 1) % size)) / (indegree + 1)``.
    All windows created here are freed at the end.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 1:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    num_in = int(np.ceil(np.log2(world_size)))
    # We use given destination to form a (right-)ring.
    left_rank = (my_rank - 1) % world_size
    right_rank = (my_rank + 1) % world_size
    expected = (my_rank * num_in + 1.23 * left_rank) / float(num_in + 1)

    cases = list(itertools.product(tensor_types, [1, 2, 3]))
    for tensor_type, ndim in cases:
        shape = [DIM_SIZE] * ndim
        payload = torch.FloatTensor(*shape).fill_(1).mul_(my_rank)
        payload = self.cast_and_place(payload, tensor_type)
        win_name = "win_put_given_{}_{}".format(ndim, tensor_type)
        bf.win_create(payload, win_name)
        bf.win_put(payload, win_name, dst_weights={right_rank: 1.23})
        bf.barrier()
        merged = bf.win_update(win_name)

        assert (list(merged.shape) == shape), (
            "bf.win_update after win_put given destination produces wrong shape tensor."
        )
        assert (merged.data - expected).abs().max() < EPSILON, (
            "bf.win_update after win_put given destination produces wrong tensor value "
            + "[{}-{}]!={} at rank {}.".format(
                merged.min(), merged.max(), expected, my_rank))

    time.sleep(0.5)
    for tensor_type, ndim in cases:
        win_name = "win_put_given_{}_{}".format(ndim, tensor_type)
        freed = bf.win_free(win_name)
        assert freed, "bf.win_free do not free window object successfully."
def test_win_accumulate_with_given_destination(self):
    """Check win_accumulate with an explicit destination followed by win_update.

    Every rank accumulates its tensor to ``(rank + 1) % size`` with weight
    1.23. ``win_update`` then mixes self and ``(rank - 1) % size`` with weight
    0.5 each, and the result is compared against
    ``rank + ((rank - 1) % size) * 1.23 / 2``.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 1:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    left_rank = (my_rank - 1) % world_size
    right_rank = (my_rank + 1) % world_size
    expected = my_rank + left_rank * 1.23 / 2.0

    for tensor_type in tensor_types:
        for ndim in (1, 2, 3):
            shape = [DIM_SIZE] * ndim
            payload = torch.FloatTensor(*shape).fill_(1).mul_(my_rank)
            payload = self.cast_and_place(payload, tensor_type)
            win_name = "win_accumulate_{}_{}".format(ndim, tensor_type)
            bf.win_create(payload, win_name)
            bf.win_accumulate(payload, win_name,
                              dst_weights={right_rank: 1.23})
            bf.barrier()
            merged = bf.win_update(win_name,
                                   self_weight=0.5,
                                   neighbor_weights={left_rank: 0.5})

            assert (list(merged.shape) == shape), (
                "bf.win_update after win_accmulate given destination produces wrong shape tensor."
            )
            assert (merged.data - expected).abs().max() < EPSILON, (
                "bf.win_update after win_accmulate given destination produces wrong tensor value "
                + "[{}-{}]!={} at rank {}.".format(
                    merged.min(), merged.max(), expected, my_rank))
def test_win_get_with_given_sources(self):
    """Check win_get with an explicit source followed by win_update.

    Every rank gets from ``(rank - 1) % size`` with weight 1.23, then
    ``win_update`` mixes self and that neighbor with weight 0.5 each. The
    result is compared against ``(rank + 1.23 * ((rank - 1) % size)) / 2``.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size <= 1:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # We use given destination to form a (right-)ring.
    left_rank = (my_rank - 1) % world_size
    expected = (my_rank + 1.23 * left_rank) / float(2)

    for tensor_type in tensor_types:
        for ndim in (1, 2, 3):
            shape = [DIM_SIZE] * ndim
            payload = torch.FloatTensor(*shape).fill_(1).mul_(my_rank)
            payload = self.cast_and_place(payload, tensor_type)
            win_name = "win_get_given_{}_{}".format(ndim, tensor_type)
            bf.win_create(payload, win_name)
            bf.win_get(win_name, src_weights={left_rank: 1.23})
            bf.barrier()
            received = bf.win_update(win_name,
                                     self_weight=0.5,
                                     neighbor_weights={left_rank: 0.5},
                                     clone=True)

            assert (list(received.shape) == shape), (
                "bf.win_get with given sources produces wrong shape tensor.")
            assert (received.data - expected).abs().max() < EPSILON, (
                "bf.win_get with given sources produces wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    received.min(), received.max(), expected, my_rank))
def test_win_mutex_given_ranks(self):
    """Check win_mutex acquisition times when explicit ranks are given.

    Rank 0 takes the mutex with ``for_self=True, ranks=[1]``, passes a barrier
    and sleeps 1.01 s. Ranks 1 and 2 pass the barrier and time acquiring the
    mutex with ``ranks=[0]``: rank 1 must wait more than 1 s, rank 2 less than
    0.1 s. All remaining ranks only take part in the barrier.
    """
    world_size = bf.size()
    my_rank = bf.rank()
    if world_size < 4:
        test_name = inspect.currentframe().f_code.co_name
        warnings.warn(
            "Skip {} because it only supports test above 4 nodes".format(
                test_name))
        return

    tensor_types = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        tensor_types.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    for tensor_type in tensor_types:
        payload = self.cast_and_place(
            torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(my_rank), tensor_type)
        win_name = "win_mutex_given_ranks_{}".format(tensor_type)
        bf.win_create(payload, win_name)

        if my_rank == 0:
            # Holder: keep the mutex for slightly more than one second.
            with bf.win_mutex(win_name, for_self=True, ranks=[1]):
                bf.barrier()
                time.sleep(1.01)
            continue

        bf.barrier()
        if my_rank not in (1, 2):
            # Bystanders only need to match the barrier.
            continue

        started = time.time()
        with bf.win_mutex(win_name, ranks=[0]):
            time.sleep(0.001)
        elapsed = time.time() - started
        if my_rank == 1:
            assert elapsed > 1
        else:
            assert elapsed < 0.1
} self_weight = 1 / (1 + outdegree) bf.win_accumulate(x, name="x", self_weight=self_weight, dst_weights=dst_weights, require_mutex=True) bf.win_update_then_collect(name="x") associated_p = bf.win_associated_p(name="x") mse.append( torch.norm(x / associated_p - x_bar, p=2) / torch.norm(x_bar, p=2)) # Do not forget to sync at last! bf.barrier() bf.win_update_then_collect(name="x") associated_p = bf.win_associated_p(name="x") print(f"associated p at {bf.rank()} is {associated_p}") bf.turn_off_win_ops_with_associated_p() mse.append( torch.norm(x / associated_p - x_bar, p=2) / torch.norm(x_bar, p=2)) bf.win_free(name="x") else: p = torch.DoubleTensor([1.0]).to(x.device) x_ext = torch.cat([x, p], 0) bf.win_create(x_ext, name="x_ext", zero_init=True) for i in range(args.max_iters): if args.enable_dynamic_topology: num_out_neighbors = len(bf.out_neighbor_ranks()) sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors]
def push_diging(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
    """Run the push-DIGing iteration using one-sided window operations.

    Args:
        X, y: Data passed unchanged to ``loss_step``.
        w_opt: Reference solution; only used to record the error
            ``norm(x - w_opt)`` on rank 0.
        loss: ``'logistic_regression'`` or ``'linear_regression'``.
        maxite: Number of iterations.
        alpha: Step size applied to the gradient-tracking part of ``w``.
        **kwargs: ``rho`` (default ``1e-1``) is read for logistic regression;
            for linear regression ``rho`` is fixed to 0.

    Returns:
        ``(x, mse)`` where ``x`` is the final iterate and ``mse`` is the list
        of per-iteration errors (filled on rank 0 only, empty elsewhere).

    Raises:
        NotImplementedError: If ``loss`` is not one of the two supported names.

    Note:
        The problem dimension ``n`` is not a parameter; it is read from the
        enclosing (module) scope.
    """
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    outdegree = len(bf.out_neighbor_ranks())

    # We let w = col{u, y, v}, i.e., u, y, v = w[:n], w[n:2*n], w[2n]
    # Instead of three directed_neighbor_allreduce operations for u, y,
    # and v respectively, we exploit one directed_neighbor_allreduce for
    # the combo vector w. This guarantees u, y, and v to be transmitted
    # simultaneously and avoids the mismatch between them. Experiments
    # show directed_neighbor_allreduce(w) is crucial for convergence of
    # push_diging.
    w = torch.zeros(2 * n + 1, 1).to(torch.double)
    x = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)

    # Initial gradient goes into the tracking slot w[n:2n].
    loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)
    grad = x.grad.data.clone()
    w[n:2 * n] = grad
    x.grad.data.zero_()

    w[-1] = 1.0
    grad_prev = w[n:2 * n].clone()

    bf.win_create(w, name="w_buff", zero_init=True)
    mse = []
    for _ in range(maxite):
        bf.barrier()

        # Local descent step on u, then push half of w to the out-neighbors
        # (split evenly) and keep the other half locally.
        w[:n] = w[:n] - alpha * w[n:2 * n]
        bf.win_accumulate(w,
                          name="w_buff",
                          dst_weights={
                              rank: 1.0 / (outdegree * 2)
                              for rank in bf.out_neighbor_ranks()
                          },
                          require_mutex=True)
        w.div_(2)
        bf.barrier()

        w = bf.win_update_then_collect(name="w_buff")

        # De-bias u by the last entry v before evaluating the gradient.
        x.data = w[:n] / w[-1]
        loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)
        grad = x.grad.data.clone()
        x.grad.data.zero_()

        # Gradient tracking: add the change in the local gradient.
        w[n:2 * n] += grad - grad_prev
        grad_prev = grad
        if bf.rank() == 0:
            mse.append(torch.norm(x.data - w_opt, p=2))

    # Final sync so that contributions still sitting in the window are collected.
    bf.barrier()
    w = bf.win_update_then_collect(name="w_buff")
    x.data = w[:n] / w[-1]

    return x, mse