def hier_setup():
    """Initialize BlueFog for hierarchical tests and return rank information.

    Sets ``BLUEFOG_NODES_PER_MACHINE`` to ``'2'`` before calling ``bf.init()``,
    requires an even world size, and installs an exponential graph over
    ``size // 2`` machines as the machine topology.

    Returns:
        The tuple ``(bf.rank(), bf.size(), bf.local_rank(), bf.local_size())``.
    """
    os.environ['BLUEFOG_NODES_PER_MACHINE'] = '2'
    bf.init()

    # Two nodes per machine only divides evenly for an even world size.
    assert bf.size() % 2 == 0
    machine_count = int(bf.size() // 2)
    machine_graph = bf.ExponentialGraph(machine_count)
    bf.set_machine_topology(machine_graph)

    rank_info = (bf.rank(), bf.size(), bf.local_rank(), bf.local_size())
    return rank_info
def test_set_topology_fail_with_win_create(self):
    """Check that bf.set_topology is rejected while a window object exists.

    A window is created first; the subsequent ``bf.set_topology`` call must
    return a falsy value and the loaded topology must still be equivalent to
    ``ExponentialGraph(size)``. The window is freed even if a check fails so
    that it does not leak into later tests.
    """
    bf.init()
    size = bf.size()
    if size <= 1:
        fname = inspect.currentframe().f_code.co_name
        warnings.warn("Skip {} due to size 1".format(fname))
        return

    tensor = torch.FloatTensor([1])
    window_name = "win_create_test"
    is_created = bf.win_create(tensor, window_name)
    assert is_created, "bf.win_create do not create window object successfully."

    try:
        # size == 1 already returned above, so only these two cases remain.
        if size == 2:
            rejected_topology = nx.from_numpy_array(
                np.array([[0, 0.2], [0.2, 0]]), create_using=nx.DiGraph)
        else:
            rejected_topology = RingGraph(size)
        is_set = bf.set_topology(rejected_topology)
        assert not is_set, "bf.set_topology do not fail due to win_create."

        # The topology in effect must be unchanged by the rejected call.
        topology = bf.load_topology()
        assert isinstance(topology, nx.DiGraph)
        assert IsTopologyEquivalent(topology, ExponentialGraph(size))
    finally:
        is_freed = bf.win_free()
    assert is_freed, "bf.win_free do not free window object successfully."
def __init__(self, params, model, num_steps_per_communication):
    """Initialize optimizer state for window-accumulate based communication.

    Args:
        params: Forwarded unchanged to the parent optimizer constructor.
        model: Passed to ``_check_named_parameters`` to obtain the named
            parameters and the model list.
        num_steps_per_communication: Initial value of the per-parameter
            counter stored in ``self._pushsum_delay``.
    """
    # NOTE(review): the super(self.__class__, self) form is kept exactly as
    # written; presumably this class is instantiated through a dynamically
    # built type -- confirm before switching to zero-argument super().
    super(self.__class__, self).__init__(params)

    # use to control the behavior of win_accumulate dynamically.
    uniform_weight = 1.0 / (len(bf.out_neighbor_ranks()) + 1)
    self.dst_weights = dict.fromkeys(bf.out_neighbor_ranks(), uniform_weight)
    self.self_weight = uniform_weight
    self.force_barrier = True

    named_parameters, models = _check_named_parameters(self, model)
    self._models = models
    # Reverse lookup: parameter object -> parameter name.
    self._parameter_names = dict(
        (param, name) for name, param in sorted(named_parameters))
    self._handles = {}  # store parameter -> handle
    self._named_ps_weights = {}
    self._named_extension_parameters = {}
    self._synchronized = False
    self._should_synchronize = True
    self._use_timeline = False
    self._num_steps_per_communication = num_steps_per_communication
    # Every parameter starts with the full step count.
    self._pushsum_delay = dict.fromkeys(
        (param for _, param in sorted(named_parameters)),
        self._num_steps_per_communication,
    )
    self._timeline_hook_handles = []

    if bf.size() > 1:
        self._register_window()
        self._register_hooks()
def adjust_learning_rate(epoch, batch_idx):
    """Set the learning rate of every optimizer param group for this step.

    During the first ``args.warmup_epochs`` epochs the adjustment factor ramps
    from ``1 / bf.size()`` towards 1 using the fractional epoch
    (``epoch + (batch_idx + 1) / len(train_loader)``). Afterwards the factor
    is 1.0 before epoch 30, 0.1 before epoch 60, 0.01 before epoch 80, and
    0.001 from then on.

    Args:
        epoch: Current epoch index.
        batch_idx: Index of the current batch within the epoch.
    """
    if epoch < args.warmup_epochs:
        epoch += float(batch_idx + 1) / len(train_loader)
        lr_adj = 1.0 / bf.size() * (epoch * (bf.size() - 1) / args.warmup_epochs + 1)
    else:
        # Step schedule: the first boundary that exceeds the epoch wins.
        for upper_epoch, factor in ((30, 1.0), (60, 1e-1), (80, 1e-2)):
            if epoch < upper_epoch:
                lr_adj = factor
                break
        else:
            lr_adj = 1e-3

    for group in optimizer.param_groups:
        group["lr"] = (
            args.base_lr * bf.size() * args.batches_per_allreduce * lr_adj
        )
def test_timeline_push_sum(self):
    """Run a synchronous push-sum simulation and check the timeline output.

    Uses ``win_accumulate`` for ten rounds, then verifies that this rank's
    timeline file mentions both the MPI and the enqueue accumulate events.
    """
    # Use win_accumulate to simulate the push-sum algorithm (sync).
    num_out = len(bf.out_neighbor_ranks())
    num_in = len(bf.in_neighbor_ranks())

    # we append the p at the last of data.
    x = torch.Tensor(
        [bf.rank() / (num_in + 1), 1.0 / bf.size() / (num_in + 1)])

    # Remember we do not create buffer with 0.
    bf.win_create(x, name="x_buff")
    x = bf.win_update_then_collect(name="x_buff")

    for _ in range(10):
        send_weights = {
            dst: 1.0 / (num_out + 1) for dst in bf.out_neighbor_ranks()
        }
        bf.win_accumulate(x, name="x_buff", dst_weights=send_weights,
                          require_mutex=True)
        x.div_(1 + num_out)
        x = bf.win_update_then_collect(name="x_buff")

    bf.barrier()
    # Do not forget to sync at last!
    x = bf.win_update_then_collect(name="x_buff")

    timeline_path = f"{self.temp_file}{bf.rank()}.json"
    with open(timeline_path, 'r') as timeline_file:
        timeline_text = timeline_file.read()
    for event_name in ('MPI_WIN_ACCUMULATE', 'ENQUEUE_WIN_ACCUMULATE'):
        assert event_name in timeline_text, timeline_text

    bf.win_free()
def test_win_get(self):
    """Test that win_get followed by win_update yields the in-neighbor average."""
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    indegree = int(np.ceil(np.log2(size)))
    in_neighbors = [(rank - 2**k) % size for k in range(indegree)]
    # Each rank holds its own rank value, so the average is over rank ids.
    avg_value = (rank + np.sum(in_neighbors)) / float(indegree + 1)

    for dtype in dtypes:
        for dim in (1, 2, 3):
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_get_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            bf.win_get(window_name)
            bf.barrier()
            recv_tensor = bf.win_update(window_name, clone=True)

            assert list(recv_tensor.shape) == shape, (
                "bf.win_get produce wrong shape tensor.")
            max_error = (recv_tensor.data - avg_value).abs().max()
            assert max_error < EPSILON, (
                "bf.win_get produce wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    recv_tensor.min(), recv_tensor.max(), avg_value, rank))
def InferDestinationFromSourceRanks(
    src_ranks: List[int],
    construct_adjacency_matrix: bool = False,
) -> Union[List[int], np.array]:
    """Infer the destination ranks from source ranks.

    This is a collective communication call.

    Args:
        src_ranks: A list of source ranks.
        construct_adjacency_matrix: If true, the adjacency matrix is returned
            in addition to the rank list. Element w_{ij} represents the weight
            sent from node i to node j. We use column normalized style, i.e.
            the sum of receiving weights is 1.

    Raises:
        AssertionError: If ``_check_ranks`` reports ``src_ranks`` as invalid
            for the current rank and size.

    Returns:
        If construct_adjacency_matrix is false, returns the destination ranks
        list. If construct_adjacency_matrix is true, returns the destination
        ranks list and a 2-D numpy array.
    """
    is_valid, error_msg = _check_ranks(src_ranks, bf.rank(), bf.size())
    assert is_valid, f"The format of src_ranks is wrong: {error_msg}"
    return _infer_topo(
        src_ranks,
        transpose=True,
        construct_adjacency_matrix=construct_adjacency_matrix,
    )
def test_win_update_with_given_weights(self):
    """Test win_update with explicit uniform self and neighbor weights.

    The update runs right after window creation and the result is expected to
    equal this rank's own value.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    for dtype in dtypes:
        for dim in (1, 2, 3):
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create do not create window object successfully."

            # Test simple average rule.
            weight = 1.0 / (len(bf.in_neighbor_ranks()) + 1)
            uniform_neighbor_weights = dict.fromkeys(
                bf.in_neighbor_ranks(), weight)
            sync_result = bf.win_update(
                window_name,
                self_weight=weight,
                neighbor_weights=uniform_neighbor_weights)

            assert list(sync_result.shape) == shape, (
                "bf.win_update (weighted) produces wrong shape tensor.")
            max_error = (sync_result.data - rank).abs().max()
            assert max_error < EPSILON, (
                "bf.win_update (weighted) produces wrong tensor value " +
                "[{0}-{1}]!={2} at rank {2}.".format(
                    sync_result.min(), sync_result.max(), rank))
def test_win_update_then_collect(self):
    """Test that win_update_then_collect returns rank * (indegree + 1) twice in a row."""
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    indegree = int(np.ceil(np.log2(size)))
    expected_result = rank * (indegree + 1)

    for dtype in dtypes:
        for dim in (1, 2, 3):
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_update_collect_{}_{}".format(dim, dtype)

            bf.win_create(tensor, window_name)

            # After the collect ops, the neighbor tensor will become zero.
            # So second win_update_then_collect should produce the same value.
            for _ in range(2):
                collect_tensor = bf.win_update_then_collect(window_name)

                assert list(collect_tensor.shape) == shape, (
                    "bf.win_update_then_collect produces wrong shape tensor.")
                max_error = (collect_tensor.data - expected_result).abs().max()
                assert max_error < EPSILON, (
                    "bf.win_update_then_collect produces wrong tensor value " +
                    "[{0}-{1}]!={2} at rank {2}.".format(
                        collect_tensor.min(), collect_tensor.max(), rank))
def test_win_mutex_full(self):
    """Test that a mutex held by rank 0 delays every other rank's acquisition.

    Runs on a fully connected topology: rank 0 holds the mutex for a little
    over one second while the other ranks time how long their own acquisition
    takes, which must fall between one and two seconds.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 2:
        warnings.warn(
            "Skip {} because it only supports test over at least 3 nodes".format(
                inspect.currentframe().f_code.co_name))
        return

    bf.set_topology(topology_util.FullyConnectedGraph(size))

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    for dtype in dtypes:
        tensor = torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(rank)
        tensor = self.cast_and_place(tensor, dtype)
        window_name = "win_mutex_full_{}".format(dtype)
        bf.win_create(tensor, window_name)

        if rank != 0:
            # Wait until rank 0 holds the mutex, then time our own acquisition.
            bf.barrier()
            t_start = time.time()
            with bf.win_mutex(window_name):
                time.sleep(0.001)
            elapsed = time.time() - t_start
            assert elapsed > 1, \
                "The mutex acquire time should be longer than 1 second"
            assert elapsed < 2, \
                "The mutex acquire time should be shorter than 2 second"
        else:
            with bf.win_mutex(window_name, for_self=True):
                bf.barrier()
                time.sleep(1.01)
def _infer_topo(
    rank_list: List[int], transpose: bool, construct_adjacency_matrix: bool
):
    """Gather every node's rank list and infer this node's inverse neighbors.

    Every node contributes ``rank_list`` through ``bf.allgather``; the ranks
    whose lists contain the current rank are returned.

    Args:
        rank_list: The ranks this node declares as neighbors.
        transpose: If true, the adjacency matrix is transposed before it is
            normalized.
        construct_adjacency_matrix: If true, the normalized adjacency matrix
            is returned together with the rank list.

    Returns:
        The list of ranks whose ``rank_list`` contains ``bf.rank()``, or that
        list together with a ``size x size`` numpy array whose columns each
        sum to 1 when ``construct_adjacency_matrix`` is true.
    """
    degree = len(rank_list)
    all_degree_list = bf.allgather(torch.tensor([degree], dtype=torch.int32)).numpy()
    all_rank_list = bf.allgather(torch.tensor(rank_list, dtype=torch.int32)).numpy()

    # Split the flat gathered rank array back into one list per node.
    adjacency_dict = {}
    displacement = 0
    for i, node_degree in enumerate(all_degree_list):
        adjacency_dict[i] = sorted(
            all_rank_list[displacement:displacement + node_degree])
        displacement += node_degree

    # Invert the mapping: for each listed rank, who listed it?
    inv_adjacency_dict = collections.defaultdict(list)
    for k, adj in adjacency_dict.items():
        for v in adj:
            inv_adjacency_dict[v].append(k)
    return_list = inv_adjacency_dict.get(bf.rank(), [])

    if not construct_adjacency_matrix:
        return return_list

    # The identity supplies a self-loop, so no column sum can be zero.
    W = np.eye(bf.size())
    for k, adj in adjacency_dict.items():
        W[k, adj] = 1
    if transpose:
        W = W.T

    # Column-normalize: element [i, j] is divided by the sum of column j.
    # Dividing by W.sum(axis=1) would use row sums, which only agrees when
    # each node's row and column sums are equal.
    return return_list, W / W.sum(axis=0)
def __init__(self, params, model, num_steps_per_communication, pull_style):
    """Initialize optimizer state for window put/get based communication.

    Args:
        params: Forwarded unchanged to the parent optimizer constructor.
        model: Passed to ``_check_named_parameters`` to obtain the named
            parameters and the model list.
        num_steps_per_communication: Initial value of the per-parameter
            counter stored in ``self._bluefog_delay``.
        pull_style: If truthy, ``src_weights`` is created; otherwise
            ``dst_weights`` is created.
    """
    # NOTE(review): the super(self.__class__, self) form is kept exactly as
    # written; presumably this class is instantiated through a dynamically
    # built type -- confirm before switching to zero-argument super().
    super(self.__class__, self).__init__(params)

    if not pull_style:
        # use to control the behavior of win_put dynamically.
        self.dst_weights = None
    else:
        # use to control the behavior of win_get dynamically.
        self.src_weights = None
    self.force_barrier = False

    named_parameters, models = _check_named_parameters(self, model)
    self._models = models
    self._pull_style = pull_style
    # Reverse lookup: parameter object -> parameter name.
    self._parameter_names = dict(
        (param, name) for name, param in sorted(named_parameters))
    self._handles = {}  # store parameter -> handle
    self._synchronized = False
    self._should_synchronize = True
    self._use_timeline = False
    self._num_steps_per_communication = num_steps_per_communication
    # Every parameter starts with the full step count.
    self._bluefog_delay = dict.fromkeys(
        (param for _, param in sorted(named_parameters)),
        self._num_steps_per_communication,
    )
    self._timeline_hook_handles = []

    if os.getenv('BLUEFOG_TIMELINE'):
        self.turn_on_timeline()
    if bf.size() > 1:
        self._register_window()
        self._register_hooks()
def test_bluefog_size(self):
    """Test that bf.size() matches the size reported by mpi_env_rank_and_size()."""
    _unused_rank, expected_size = mpi_env_rank_and_size()
    bf.init()
    reported_size = bf.size()
    assert expected_size == reported_size
def test_get_win_version_with_win_get(self):
    """Test that window versions are initialized, bumped by win_get and cleared by win_update."""
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    indegree = int(np.ceil(np.log2(size)))
    in_neighbors = [(rank - 2**k) % size for k in range(indegree)]
    num_neighbors = len(in_neighbors)
    dims = [1, 2, 3]

    def count_zeros(versions):
        # Number of entries whose version is exactly zero.
        return len(versions) - np.count_nonzero(versions)

    for dtype in dtypes:
        for dim in dims:
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_version_get_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            # Snapshot the versions at three points: fresh, after get, after update.
            versions_initial = list(bf.get_win_version(window_name).values())
            bf.barrier()
            bf.win_get(window_name)
            bf.barrier()
            versions_after_get = list(bf.get_win_version(window_name).values())
            bf.win_update(window_name, clone=True)
            versions_after_update = list(
                bf.get_win_version(window_name).values())

            assert count_zeros(versions_initial) == num_neighbors, (
                "version initialization is wrong.")
            assert count_zeros(versions_after_update) == num_neighbors, (
                "version clear up is wrong.")
            assert versions_after_get == [1] * num_neighbors, (
                "version after win get is wrong.")

    for dtype in dtypes:
        for dim in dims:
            window_name = "win_version_get_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."
def test_in_out_neighbors_expo2(self):
    """Test the in/out neighbor ranks reported under an exponential graph."""
    bf.init()
    rank = bf.rank()
    size = bf.size()
    assert bf.set_topology(ExponentialGraph(size))

    actual_in = bf.in_neighbor_ranks()
    actual_out = bf.out_neighbor_ranks()

    # Neighbors sit at power-of-two hops: behind for in, ahead for out.
    degree = int(np.ceil(np.log2(size)))
    hops = [2**i for i in range(degree)]
    expected_in = sorted((rank - hop) % size for hop in hops)
    expected_out = sorted((rank + hop) % size for hop in hops)

    assert sorted(actual_in) == expected_in
    assert sorted(actual_out) == expected_out
def problem_setup(net=LinearNet):
    """Build the linear problem, data loaders, model and optimizer for a test.

    Args:
        net: Model class; called with the problem's input and output dims.

    Returns:
        The tuple ``(problem_builder, train_dataloader, test_dataloader,
        model, optimizer, num_epochs)``.
    """
    bf.init()
    num_epochs = 50
    batch_size = 128
    num_train_per_node = 1024
    num_test_per_node = 128
    lr = 0.01

    # Setup Problem
    problem_builder = LinearProblemBuilder()
    train_dataloader = DataLoader(
        problem_builder.get_dataset(num_train_per_node),
        batch_size=batch_size)
    test_dataloader = DataLoader(
        problem_builder.get_dataset(num_test_per_node),
        batch_size=batch_size)

    # Setup Model
    model = net(problem_builder.input_dim, problem_builder.output_dim)
    total_train_samples = num_train_per_node*bf.size()
    assert (
        total_train_samples >= model.num_parameters
    ), "The number of samples is too small making it an underdetermined system."

    # Setup Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr*bf.size())
    # Start every rank from rank 0's parameters and optimizer state.
    bf.broadcast_parameters(model.state_dict(), root_rank=0)
    bf.broadcast_optimizer_state(optimizer, root_rank=0)

    return (problem_builder, train_dataloader, test_dataloader, model,
            optimizer, num_epochs)
def test_asscoicated_with_p(self):
    """Test the associated-p value after one sender accumulates on a ring.

    For every candidate sender rank, only that rank calls ``win_accumulate``
    with weight 0.5 for itself and for each ring neighbor. After the collect
    step the sender must hold 0.5, its two neighbors 1.5, and every other
    rank 1.0. The test is skipped unless more than 3 nodes are available.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 3:
        fname = inspect.currentframe().f_code.co_name
        # The guard skips sizes up to 3, so the message states 4 as minimum.
        warnings.warn(
            "Skip {} because it only supports test over at least 4 nodes".
            format(fname))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    # GPU dtypes are only exercised when BlueFog was not built with NCCL.
    if TEST_ON_GPU and not bf.nccl_built():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

    bf.set_topology(topology_util.RingGraph(size))
    bf.turn_on_win_ops_with_associated_p()
    try:
        for dtype, send_rank in itertools.product(dtypes, range(size)):
            tensor = torch.FloatTensor([23]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_{}_{}".format(
                dtype, send_rank)
            bf.win_create(tensor, window_name)

            left_neighbor_rank = (send_rank - 1) % size
            right_neighbor_rank = (send_rank + 1) % size
            if rank == send_rank:
                bf.win_accumulate(tensor, name=window_name, self_weight=0.5,
                                  dst_weights={
                                      left_neighbor_rank: 0.5,
                                      right_neighbor_rank: 0.5
                                  })
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)

            if rank == send_rank:
                assert associated_p == 0.5, (
                    "associated_p for sender {} is wrong. Get {}".format(
                        rank, associated_p))
            elif rank in (left_neighbor_rank, right_neighbor_rank):
                # abs() is required: without it any value below 1.5 passes.
                assert abs(associated_p - 1.5) < EPSILON, (
                    "associated_p for received neighbor {} is wrong. Get {}".
                    format(rank, associated_p))
            else:
                assert associated_p == 1.0, (
                    "associated_p for untouched node {} is wrong. Get {}".
                    format(rank, associated_p))
    finally:
        # Restore the global switch even when an assertion fails.
        bf.turn_off_win_ops_with_associated_p()
def test_infer_source_from_destination_ranks(topo_func):
    """Test InferSourceFromDestinationRanks against the installed topology.

    Args:
        topo_func: Callable that builds a topology graph from the world size.
    """
    bf.init()
    size = bf.size()
    bf.set_topology(topo_func(size))
    topo = bf.load_topology()
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    # Make the W into average rule.
    connectivity = (nx.to_numpy_array(topo) > 0).astype(float)
    expected_W = connectivity / connectivity.sum(axis=0)

    inferred_src_ranks, W = InferSourceFromDestinationRanks(
        dst_ranks=out_neighbors, construct_adjacency_matrix=True)

    assert sorted(inferred_src_ranks) == in_neighbors
    np.testing.assert_allclose(W, expected_W)
def test_set_and_load_topology(self):
    """Test that the loaded topology matches the expected graph for this size."""
    bf.init()
    size = bf.size()

    if size == 1:
        expected_topology = nx.DiGraph(np.array([[1.0]]))
    elif size == 4:
        # Explicit weight matrix for the four-node case.
        four_node_weights = np.array(
            [[1 / 3., 1 / 3., 1 / 3., 0.],
             [0., 1 / 3., 1 / 3., 1 / 3.],
             [1 / 3., 0., 1 / 3., 1 / 3.],
             [1 / 3., 1 / 3., 0., 1 / 3.]])
        expected_topology = nx.DiGraph(four_node_weights)
    else:
        expected_topology = ExponentialGraph(size)

    loaded_topology = bf.load_topology()
    assert isinstance(loaded_topology, nx.DiGraph)
    assert IsTopologyEquivalent(expected_topology, loaded_topology)
def test_win_put_with_varied_tensor_elements(self):
    """Test win_put followed by win_update on tensors with non-uniform elements."""
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    indegree = int(np.ceil(np.log2(size)))
    in_neighbors = [(rank - 2**k) % size for k in range(indegree)]
    avg_value = (rank + np.sum(in_neighbors)) / float(indegree + 1)
    dims = [1, 2, 3]

    for dtype in dtypes:
        for dim in dims:
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            # Per-element offset shared by all ranks; removed again below.
            base_tensor = torch.arange(
                DIM_SIZE**dim, dtype=torch.float32).view_as(tensor).div(1000)
            tensor = self.cast_and_place(tensor, dtype)
            base_tensor = self.cast_and_place(base_tensor, dtype)
            tensor = tensor + base_tensor
            window_name = "win_put_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            bf.win_put(tensor, window_name)
            bf.barrier()
            sync_result = bf.win_update(window_name)

            assert list(sync_result.shape) == shape, (
                "bf.win_update after win_put produces wrong shape tensor.")
            residual = sync_result - base_tensor
            assert (residual.data - avg_value).abs().max() < EPSILON, (
                "bf.win_update after win_put produces wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    residual.min(), residual.max(), avg_value, rank))

    time.sleep(0.5)
    for dtype in dtypes:
        for dim in dims:
            window_name = "win_put_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."
def __init__(self, params, model, reduce_type, num_steps_per_communication=1):
    """Initialize optimizer state for reduce-based communication.

    Args:
        params: Forwarded unchanged to the parent optimizer constructor.
        model: Passed to ``_check_named_parameters`` to obtain the named
            parameters and the model list.
        reduce_type: One of ``"allreduce"``, ``"neighbor.allreduce"`` or
            ``"hierarchical.neighbor.allreduce"``.
        num_steps_per_communication: Initial value of the per-parameter
            counter stored in ``self._reduce_delay``.

    Raises:
        ValueError: If ``reduce_type`` is not one of the known names.
    """
    # NOTE(review): the super(self.__class__, self) form is kept exactly as
    # written; presumably this class is instantiated through a dynamically
    # built type -- confirm before switching to zero-argument super().
    super(self.__class__, self).__init__(params)

    named_parameters, models = _check_named_parameters(self, model)

    # knobs for neighbor communication behavior
    self.self_weight = None
    self.neighbor_weights = None
    self.send_neighbors = None
    self.neighbor_machine_weights = None
    self.send_neighbor_machines = None
    self.enable_topo_check = False

    self._models = models
    # Reverse lookup: parameter object -> parameter name.
    self._parameter_names = dict(
        (param, name) for name, param in sorted(named_parameters))
    self._handles = {}
    self._requires_update = set()
    self._synchronized = False
    self._should_synchronize = True
    self._timeline_hook_handles = []
    self._use_timeline = False
    self._num_steps_per_communication = num_steps_per_communication
    self._reduce_type_str = reduce_type

    # _reduce_method: 0 for allreduce, 1 for neighbor_allreduce, and
    # 2 for hierarchical_neighbor_allreduce (the position in this tuple).
    known_reduce_types = (
        "allreduce",
        "neighbor.allreduce",
        "hierarchical.neighbor.allreduce",
    )
    for method_code, type_name in enumerate(known_reduce_types):
        if self._reduce_type_str == type_name:
            self._reduce_method = method_code
            break
    else:
        raise ValueError(
            "Unknown reduce type for internal class _DistributedReduceOptimizer"
        )

    # Every parameter starts with the full step count.
    self._reduce_delay = dict.fromkeys(
        (param for _, param in sorted(named_parameters)),
        self._num_steps_per_communication,
    )

    if os.getenv('BLUEFOG_TIMELINE'):
        self.turn_on_timeline()
    if bf.size() > 1:
        self._register_hooks()
def test_asscoicated_with_p_random_test(self):
    """Test that associated p tracks the tensor value under random-weight window ops."""
    size = bf.size()
    rank = bf.rank()

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    # Current, nccl version hasn't supported the associated with p yet.
    if TEST_ON_GPU and not bf.nccl_built():
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    bf.turn_on_win_ops_with_associated_p()
    for dtype in dtypes:
        for dim in (1,):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_random_{}_{}".format(
                dim, dtype)
            bf.win_create(tensor, window_name, zero_init=True)

            for _ in range(10):
                # One random weight per out-neighbor plus one for self,
                # normalized so that they sum to one.
                random_weights = np.random.rand(
                    len(bf.out_neighbor_ranks()) + 1)
                random_weights /= random_weights.sum()
                self_weight = random_weights[-1]
                dst_weights = {}
                for position, dst_rank in enumerate(bf.out_neighbor_ranks()):
                    dst_weights[dst_rank] = random_weights[position]

                bf.win_put(tensor, self_weight=self_weight,
                           dst_weights=dst_weights, name=window_name,
                           require_mutex=True)
                bf.win_update(name=window_name, require_mutex=True)
                bf.win_accumulate(tensor, name=window_name,
                                  require_mutex=True,
                                  self_weight=self_weight,
                                  dst_weights=dst_weights)
                bf.win_update_then_collect(name=window_name)

            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            # Because the associated p should operate the same as tensor always
            # the following assert should be true no matter what order is executed.
            assert abs(associated_p - tensor.data[0]) < EPSILON

    bf.turn_off_win_ops_with_associated_p()
def test_in_out_neighbors_biring(self):
    """Test the in/out neighbor ranks reported under a bidirectional ring.

    Each rank expects ``rank - 1`` and ``rank + 1`` (modulo size) both as
    in-neighbors and as out-neighbors, and no neighbors at all when the world
    size is at most 1.
    """
    bf.init()
    rank = bf.rank()
    size = bf.size()
    assert bf.set_topology(RingGraph(size))
    in_neighbors = bf.in_neighbor_ranks()
    out_neighbors = bf.out_neighbor_ranks()

    if size <= 1:
        expected_neighbors = []
    else:
        # The set removes the duplicate when both sides are the same rank
        # (size == 2). It must be sorted explicitly: set iteration order is
        # not sorted in general, e.g. {7, 9} can iterate as 9, 7.
        expected_neighbors = sorted({(rank - 1) % size, (rank + 1) % size})

    assert sorted(in_neighbors) == expected_neighbors
    assert sorted(out_neighbors) == expected_neighbors
def test_win_put_with_given_destination(self):
    """Test win_put with an explicit destination weight, then win_update.

    Every rank puts its tensor only to ``(rank + 1) % size`` with weight 1.23.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    indegree = int(np.ceil(np.log2(size)))
    # We use given destination to form a (right-)ring.
    avg_value = (rank * indegree + 1.23 *
                 ((rank - 1) % size)) / float(indegree + 1)
    dims = [1, 2, 3]

    for dtype in dtypes:
        for dim in dims:
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_put_given_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            right_neighbor = (rank + 1) % size
            bf.win_put(tensor, window_name,
                       dst_weights={right_neighbor: 1.23})
            bf.barrier()
            sync_result = bf.win_update(window_name)

            assert list(sync_result.shape) == shape, (
                "bf.win_update after win_put given destination produces wrong shape tensor."
            )
            max_error = (sync_result.data - avg_value).abs().max()
            assert max_error < EPSILON, (
                "bf.win_update after win_put given destination produces wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    sync_result.min(), sync_result.max(), avg_value, rank))

    time.sleep(0.5)
    for dtype in dtypes:
        for dim in dims:
            window_name = "win_put_given_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."
def test_win_free_all(self):
    """Test that a single argument-less bf.win_free() call succeeds after creating several windows."""
    size = bf.size()
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    for dtype in dtypes:
        for dim in (1, 2, 3):
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create do not create window object successfully."

    # No name is given, so one call covers every window created above.
    is_freed = bf.win_free()
    assert is_freed, "bf.win_free do not free window object successfully."
def __init__(self, params, lr, L, communication_type):
    """Create the optimizer and its per-parameter ``psi``/``phi`` state.

    Args:
        params: Forwarded to the parent optimizer constructor.
        lr: Learning rate
        L: Number of batches
        communication_type: Stored as ``self._communication_type``.

    Raises:
        ValueError: If ``lr`` is negative.
    """
    if lr < 0.0:
        raise ValueError("Invalid learning rate: {}".format(lr))

    defaults = {"lr": lr, "L": L}
    super(ExactDiff, self).__init__(params, defaults)

    self._communication_type = communication_type
    self.lr = lr
    self.L = L
    self._q = bf.size()

    # psi starts as a copy of the parameter, phi as zeros, no handle yet.
    self._states = {}
    for group in self.param_groups:
        for param in group['params']:
            param_state = {
                'psi': torch.clone(param),
                'phi': torch.zeros_like(param),
                'handle': None,
            }
            self._states[param] = param_state
def __init__(self, params, model, backward_passes_per_step=1):
    """Initialize optimizer state for allreduce-based communication.

    Args:
        params: Forwarded unchanged to the parent optimizer constructor.
        model: Passed to ``_check_named_parameters`` to obtain the named
            parameters and the model list.
        backward_passes_per_step: Initial value of the per-parameter counter
            stored in ``self._allreduce_delay``.
    """
    # NOTE(review): the super(self.__class__, self) form is kept exactly as
    # written; presumably this class is instantiated through a dynamically
    # built type -- confirm before switching to zero-argument super().
    super(self.__class__, self).__init__(params)

    named_parameters, models = _check_named_parameters(self, model)
    self._models = models
    # Reverse lookup: parameter object -> parameter name.
    self._parameter_names = dict(
        (param, name) for name, param in sorted(named_parameters))
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    self._synchronized = False
    self._should_synchronize = True
    self._timeline_hook_handles = []
    self._use_timeline = False
    self._backward_passes_per_step = backward_passes_per_step
    # Every parameter starts with the full backward-pass count.
    self._allreduce_delay = dict.fromkeys(
        (param for _, param in sorted(named_parameters)),
        self._backward_passes_per_step,
    )

    if os.getenv('BLUEFOG_TIMELINE'):
        self.turn_on_timeline()
    if bf.size() > 1:
        self._register_hooks()
def test_win_mutex_given_ranks(self):
    """Test win_mutex restricted to explicitly given ranks.

    Rank 0 holds the mutex for itself and rank 1 for a little over a second.
    Ranks 1 and 2 then each time their acquisition of rank 0's mutex: rank 1
    must take more than one second and rank 2 less than 0.1 second. All other
    ranks only join the barrier.
    """
    size = bf.size()
    rank = bf.rank()
    if size < 4:
        warnings.warn(
            "Skip {} because it only supports test above 4 nodes".format(
                inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    for dtype in dtypes:
        tensor = torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(rank)
        tensor = self.cast_and_place(tensor, dtype)
        window_name = "win_mutex_given_ranks_{}".format(dtype)
        bf.win_create(tensor, window_name)

        if rank == 0:
            with bf.win_mutex(window_name, for_self=True, ranks=[1]):
                bf.barrier()
                time.sleep(1.01)
        elif rank in (1, 2):
            bf.barrier()
            t_start = time.time()
            with bf.win_mutex(window_name, ranks=[0]):
                time.sleep(0.001)
            elapsed = time.time() - t_start
            if rank == 1:
                assert elapsed > 1
            else:
                assert elapsed < 0.1
        else:
            bf.barrier()
def test_win_accumulate_with_given_destination(self):
    """Test win_accumulate with an explicit destination weight, then a weighted win_update.

    Every rank accumulates its tensor only into ``(rank + 1) % size`` with
    weight 1.23 and afterwards combines itself and ``(rank - 1) % size`` with
    weight 0.5 each.
    """
    size = bf.size()
    rank = bf.rank()
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    avg_value = rank + ((rank - 1) % size) * 1.23 / 2.0

    for dtype in dtypes:
        for dim in (1, 2, 3):
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_accumulate_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            right_neighbor = (rank + 1) % size
            bf.win_accumulate(tensor, window_name,
                              dst_weights={right_neighbor: 1.23})
            bf.barrier()

            left_neighbor = (rank - 1) % size
            sync_result = bf.win_update(
                window_name,
                self_weight=0.5,
                neighbor_weights={left_neighbor: 0.5})

            assert list(sync_result.shape) == shape, (
                "bf.win_update after win_accmulate given destination produces wrong shape tensor."
            )
            max_error = (sync_result.data - avg_value).abs().max()
            assert max_error < EPSILON, (
                "bf.win_update after win_accmulate given destination produces wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    sync_result.min(), sync_result.max(), avg_value, rank))
def test_win_create_and_sync_and_free(self):
    """Test window creation, an immediate win_update, and freeing by name."""
    size = bf.size()
    rank = bf.rank()
    # OpenMPI implementation seems won't allow win_create on size 1.
    if size <= 1:
        warnings.warn("Skip {} due to size 1".format(
            inspect.currentframe().f_code.co_name))
        return

    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if TEST_ON_GPU:
        dtypes.extend([torch.cuda.FloatTensor, torch.cuda.DoubleTensor])

    # By default, we use exponential two ring topology.
    dims = [1, 2, 3]
    for dtype in dtypes:
        for dim in dims:
            shape = [DIM_SIZE] * dim
            tensor = torch.FloatTensor(*shape).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_created = bf.win_create(tensor, window_name)
            assert is_created, "bf.win_create do not create window object successfully."

            sync_result = bf.win_update(window_name)
            assert list(sync_result.shape) == shape, (
                "bf.win_update produce wrong shape tensor.")
            # Both extremes must equal the rank, i.e. every element does.
            assert sync_result.data.min() == rank, (
                "bf.win_update produces wrong tensor value " +
                "{0}!={1} at rank {1}.".format(sync_result.data.min(), rank))
            assert sync_result.data.max() == rank, (
                "bf.win_update produces wrong tensor value " +
                "{0}!={1} at rank {1}.".format(sync_result.data.max(), rank))

    for dtype in dtypes:
        for dim in dims:
            window_name = "win_create_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."