Example #1
    def test_win_mutex_full(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 2:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires at least 3 nodes".format(fname))
            return
        bf.set_topology(topology_util.FullyConnectedGraph(size))

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        for dtype in dtypes:
            tensor = torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_mutex_full_{}".format(dtype)
            bf.win_create(tensor, window_name)

            if rank == 0:
                with bf.win_mutex(window_name, for_self=True):
                    bf.barrier()
                    time.sleep(1.01)
            else:
                bf.barrier()
                t_start = time.time()
                with bf.win_mutex(window_name):
                    time.sleep(0.001)
                t_end = time.time()
                assert (t_end - t_start) > 1, \
                    "The mutex acquire time should be longer than 1 second"
                assert (t_end - t_start) < 2, \
                    "The mutex acquire time should be shorter than 2 seconds"
Example #2
    def test_timeline_push_sum(self):
        # Use win_accumulate to simulate the push-sum algorithm (sync).
        outdegree = len(bf.out_neighbor_ranks())
        indegree = len(bf.in_neighbor_ranks())
        # We append p at the end of the data.
        x = torch.Tensor(
            [bf.rank() / (indegree + 1), 1.0 / bf.size() / (indegree + 1)])

        # Note that the window buffer is not zero-initialized here.
        bf.win_create(x, name="x_buff")
        x = bf.win_update_then_collect(name="x_buff")

        for _ in range(10):
            bf.win_accumulate(x,
                              name="x_buff",
                              dst_weights={
                                  rank: 1.0 / (outdegree + 1)
                                  for rank in bf.out_neighbor_ranks()
                              },
                              require_mutex=True)
            x.div_(1 + outdegree)
            x = bf.win_update_then_collect(name="x_buff")

        bf.barrier()
        # Do not forget to sync at the end!
        x = bf.win_update_then_collect(name="x_buff")

        file_name = f"{self.temp_file}{bf.rank()}.json"
        with open(file_name, 'r') as tf:
            timeline_text = tf.read()
            assert 'MPI_WIN_ACCUMULATE' in timeline_text, timeline_text
            assert 'ENQUEUE_WIN_ACCUMULATE' in timeline_text, timeline_text

        bf.win_free()
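
Stripped of the timeline assertions, the core pattern in this test is push-sum over a window: the last entry of the buffer carries the push-sum weight p, each iteration accumulates a 1/(outdegree+1) share into the out-neighbors, rescales the local copy, and collects. A minimal sketch of just that loop, assuming bf.init() has been called, with the estimate made explicit as x[0] / x[1]:

import torch
import bluefog.torch as bf

bf.init()
outdegree = len(bf.out_neighbor_ranks())
indegree = len(bf.in_neighbor_ranks())

# x = [value, p]; the push-sum estimate is the ratio x[0] / x[1].
x = torch.Tensor([bf.rank() / (indegree + 1), 1.0 / bf.size() / (indegree + 1)])
bf.win_create(x, name="x_buff")
x = bf.win_update_then_collect(name="x_buff")

for _ in range(50):
    bf.win_accumulate(
        x, name="x_buff",
        dst_weights={r: 1.0 / (outdegree + 1) for r in bf.out_neighbor_ranks()},
        require_mutex=True)
    x.div_(1 + outdegree)
    x = bf.win_update_then_collect(name="x_buff")

bf.barrier()
x = bf.win_update_then_collect(name="x_buff")
print(bf.rank(), "push-sum ratio:", (x[0] / x[1]).item())
bf.win_free(name="x_buff")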
Example #3
    def test_win_get(self):
        """Test that the window get operation."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use the exponential-two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        neighbor_ranks = [(rank - 2**i) % size
                          for i in range(indegree)]  # in-neighbor
        avg_value = (rank + np.sum(neighbor_ranks)) / float(indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_get_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_get(window_name)
            bf.barrier()
            recv_tensor = bf.win_update(window_name, clone=True)

            assert (list(recv_tensor.shape) == [DIM_SIZE] *
                    dim), ("bf.win_get produces a tensor with the wrong shape.")
            assert (recv_tensor.data - avg_value).abs().max() < EPSILON, (
                "bf.win_get produces a wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    recv_tensor.min(), recv_tensor.max(), avg_value, rank))
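
For reference, the create -> get -> barrier -> update flow exercised above can be written as a short standalone sketch. The topology comment and the expected value mirror the assertions in the test; bf.init() and the window name are assumptions of this sketch.

import torch
import bluefog.torch as bf

bf.init()
rank = bf.rank()

t = torch.ones(23) * rank                 # any per-rank tensor works
bf.win_create(t, "win_get_demo")
bf.win_get("win_get_demo")                # pull copies from in-neighbors
bf.barrier()
avg = bf.win_update("win_get_demo", clone=True)
# Under the default exponential-two topology, avg should equal
# (rank + sum of in-neighbor ranks) / (indegree + 1), as asserted above.
bf.win_free("win_get_demo")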
Example #4
    def test_get_win_version_with_win_get(self):
        """Test version window is initialized, updated and cleared correctly with win get."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use the exponential-two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        neighbor_ranks = [(rank - 2**i) % size
                          for i in range(indegree)]  # in-neighbor

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_version_get_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            original_versions = list(bf.get_win_version(window_name).values())
            bf.barrier()
            bf.win_get(window_name)
            bf.barrier()
            versions_after_win_get = list(
                bf.get_win_version(window_name).values())
            bf.win_update(window_name, clone=True)
            versions_after_win_update = list(
                bf.get_win_version(window_name).values())
            neighbor_ranks_number = len(neighbor_ranks)

            zero_number_in_original_versions = len(
                original_versions) - np.count_nonzero(original_versions)
            assert ((zero_number_in_original_versions) == neighbor_ranks_number
                    ), ("version initialization is wrong.")

            zero_number_after_win_update = len(
                versions_after_win_update) - np.count_nonzero(
                    versions_after_win_update)
            assert ((zero_number_after_win_update) == neighbor_ranks_number), (
                "version clear up is wrong.")

            expected_versions_after_win_get = [1] * neighbor_ranks_number

            assert (versions_after_win_get == expected_versions_after_win_get
                    ), ("version after win get is wrong.")

        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_version_get_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."
Example #5
    def test_asscoicated_with_p(self):
        size = bf.size()
        rank = bf.rank()
        if size <= 3:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires at least 4 nodes".format(fname))
            return

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU and not bf.nccl_built():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        bf.set_topology(topology_util.RingGraph(size))
        bf.turn_on_win_ops_with_associated_p()
        for dtype, send_rank in itertools.product(dtypes, range(size)):
            tensor = torch.FloatTensor([23]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_{}_{}".format(
                dtype, send_rank)
            bf.win_create(tensor, window_name)
            left_neighbor_rank = (send_rank - 1) % size
            right_neighbor_rank = (send_rank + 1) % size
            if rank == send_rank:
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  self_weight=0.5,
                                  dst_weights={
                                      left_neighbor_rank: 0.5,
                                      right_neighbor_rank: 0.5
                                  })
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            if rank == send_rank:
                assert associated_p == 0.5, (
                    "associated_p for sender {} is wrong. Get {}".format(
                        rank, associated_p))
            elif (rank == left_neighbor_rank) or (rank == right_neighbor_rank):
                assert abs(associated_p - 1.5) < EPSILON, (
                    "associated_p for received neighbor {} is wrong. Get {}".
                    format(rank, associated_p))
            else:
                assert associated_p == 1.0, (
                    "associated_p for untouched node {} is wrong. Get {}".
                    format(rank, associated_p))
        bf.turn_off_win_ops_with_associated_p()
Example #6
    def step(self, closure=None):
        if self.force_barrier:
            bf.barrier()
        # Finish any outstanding communication before applying the update.
        if self._should_synchronize:
            if self._synchronized:
                warnings.warn(
                    "optimizer.step() called without "
                    "optimizer.skip_synchronize() context after "
                    "optimizer.synchronize(). This can cause training "
                    "slowdown. You may want to consider using "
                    "optimizer.skip_synchronize() context if you use "
                    "optimizer.synchronize() in your code.")
            self.synchronize()
        self._synchronized = False
        return super(self.__class__, self).step(closure)
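
The warning above spells out the intended usage: if you call optimizer.synchronize() yourself, wrap the subsequent step() in the skip_synchronize() context so the step does not synchronize a second time. A hypothetical training step following that advice (model, batch, and opt are placeholders for your own objects, not part of this snippet):

loss = model(batch).sum()      # hypothetical forward pass
loss.backward()
opt.synchronize()              # finish communication explicitly
with opt.skip_synchronize():   # so step() does not synchronize again
    opt.step()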
Example #7
    def test_win_put_with_varied_tensor_elements(self):
        """Test that the window put operation."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use the exponential-two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        neighbor_ranks = [(rank - 2**i) % size
                          for i in range(indegree)]  # in-neighbor
        avg_value = (rank + np.sum(neighbor_ranks)) / float(indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            base_tensor = torch.arange(
                DIM_SIZE**dim, dtype=torch.float32).view_as(tensor).div(1000)
            tensor = self.cast_and_place(tensor, dtype)
            base_tensor = self.cast_and_place(base_tensor, dtype)
            tensor = tensor + base_tensor
            window_name = "win_put_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)

            bf.win_put(tensor, window_name)
            bf.barrier()
            sync_result = bf.win_update(window_name)
            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update after win_put produces wrong shape tensor.")
            assert (
                (sync_result - base_tensor).data -
                avg_value).abs().max() < EPSILON, (
                    "bf.win_update after win_put produces wrong tensor value "
                    + "[{}-{}]!={} at rank {}.".format(
                        (sync_result - base_tensor).min(),
                        (sync_result - base_tensor).max(), avg_value, rank))

        time.sleep(0.5)
        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_put_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."
Example #8
    def test_asscoicated_with_p_random_test(self):
        size = bf.size()
        rank = bf.rank()
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        # Currently, the NCCL version does not support window ops with associated p yet.
        if TEST_ON_GPU and not bf.nccl_built():
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1]
        bf.turn_on_win_ops_with_associated_p()
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([23] * dim)).fill_(1)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_asscoicate_with_p_random_{}_{}".format(
                dim, dtype)
            bf.win_create(tensor, window_name, zero_init=True)
            for _ in range(10):
                random_weights = np.random.rand(
                    len(bf.out_neighbor_ranks()) + 1)
                random_weights /= random_weights.sum()
                self_weight = random_weights[-1]
                dst_weights = {
                    r: random_weights[i]
                    for i, r in enumerate(bf.out_neighbor_ranks())
                }
                bf.win_put(tensor,
                           self_weight=self_weight,
                           dst_weights=dst_weights,
                           name=window_name,
                           require_mutex=True)
                bf.win_update(name=window_name, require_mutex=True)
                bf.win_accumulate(tensor,
                                  name=window_name,
                                  require_mutex=True,
                                  self_weight=self_weight,
                                  dst_weights=dst_weights)
                bf.win_update_then_collect(name=window_name)
            bf.barrier()
            bf.win_update_then_collect(name=window_name)
            associated_p = bf.win_associated_p(name=window_name)
            # Because the associated p always goes through the same operations as
            # the tensor, the following assert should hold no matter what order is executed.
            assert abs(associated_p - tensor.data[0]) < EPSILON

        bf.turn_off_win_ops_with_associated_p()
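
The invariant this randomized test checks is that, while turn_on_win_ops_with_associated_p() is active, the scalar associated p goes through exactly the same weighted updates as the window tensor itself. A minimal sketch of that invariant with fixed weights (assuming bf.init() has been called and, per the comment above, not running on the NCCL build):

import torch
import bluefog.torch as bf

bf.init()
bf.turn_on_win_ops_with_associated_p()

x = torch.DoubleTensor([1.0])
bf.win_create(x, "p_demo", zero_init=True)

out_ranks = bf.out_neighbor_ranks()
self_weight = 0.7
dst_weights = {r: 0.3 / len(out_ranks) for r in out_ranks} if out_ranks else {}

bf.win_accumulate(x, name="p_demo", self_weight=self_weight,
                  dst_weights=dst_weights, require_mutex=True)
bf.barrier()
x = bf.win_update_then_collect(name="p_demo")
p = bf.win_associated_p(name="p_demo")
# p received the same weighted updates as the tensor, so it should match x[0].
print(bf.rank(), x[0].item(), p)

bf.turn_off_win_ops_with_associated_p()
bf.win_free("p_demo")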
Example #9
    def test_win_put_with_given_destination(self):
        """Test that the window put operation with given destination."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # By default, we use the exponential-two ring topology.
        indegree = int(np.ceil(np.log2(size)))
        # We use a given destination to form a (right-)ring.
        avg_value = (rank * indegree + 1.23 *
                     ((rank - 1) % size)) / float(indegree + 1)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_put_given_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_put(tensor,
                       window_name,
                       dst_weights={(rank + 1) % size: 1.23})
            bf.barrier()
            sync_result = bf.win_update(window_name)
            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update after win_put given destination produces wrong shape tensor."
            )
            assert (sync_result.data - avg_value).abs().max() < EPSILON, (
                "bf.win_update after win_put given destination produces wrong tensor value "
                + "[{}-{}]!={} at rank {}.".format(
                    sync_result.min(), sync_result.max(), avg_value, rank))

        time.sleep(0.5)
        for dtype, dim in itertools.product(dtypes, dims):
            window_name = "win_put_given_{}_{}".format(dim, dtype)
            is_freed = bf.win_free(window_name)
            assert is_freed, "bf.win_free do not free window object successfully."
Example #10
    def test_win_accumulate_with_given_destination(self):
        """Test that the window accumulate operation with given destination."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        avg_value = rank + ((rank - 1) % size) * 1.23 / 2.0

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_accumulate_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_accumulate(tensor,
                              window_name,
                              dst_weights={(rank + 1) % size: 1.23})

            bf.barrier()
            sync_result = bf.win_update(window_name,
                                        self_weight=0.5,
                                        neighbor_weights={
                                            (rank - 1) % size: 0.5
                                        })

            assert (list(sync_result.shape) == [DIM_SIZE] * dim), (
                "bf.win_update after win_accmulate given destination produces wrong shape tensor."
            )
            assert (sync_result.data - avg_value).abs().max() < EPSILON, (
                "bf.win_update after win_accmulate given destination produces wrong tensor value "
                + "[{}-{}]!={} at rank {}.".format(
                    sync_result.min(), sync_result.max(), avg_value, rank))
Example #11
    def test_win_get_with_given_sources(self):
        """Test that the window get operation with given sources."""
        size = bf.size()
        rank = bf.rank()
        if size <= 1:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn("Skip {} due to size 1".format(fname))
            return
        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        # We use a given source to form a (left-)ring.
        avg_value = (rank + 1.23 * ((rank - 1) % size)) / float(2)

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([DIM_SIZE] * dim)).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_get_given_{}_{}".format(dim, dtype)
            bf.win_create(tensor, window_name)
            bf.win_get(window_name, src_weights={(rank - 1) % size: 1.23})
            bf.barrier()
            recv_tensor = bf.win_update(window_name,
                                        self_weight=0.5,
                                        neighbor_weights={
                                            (rank - 1) % size: 0.5
                                        },
                                        clone=True)

            assert (list(recv_tensor.shape) == [DIM_SIZE] * dim), (
                "bf.win_get with given sources produces wrong shape tensor.")
            assert (recv_tensor.data - avg_value).abs().max() < EPSILON, (
                "bf.win_get with given sources produces wrong tensor value " +
                "[{}-{}]!={} at rank {}.".format(
                    recv_tensor.min(), recv_tensor.max(), avg_value, rank))
Example #12
    def test_win_mutex_given_ranks(self):
        size = bf.size()
        rank = bf.rank()
        if size < 4:
            fname = inspect.currentframe().f_code.co_name
            warnings.warn(
                "Skip {} because it requires at least 4 nodes".format(fname))
            return

        dtypes = [torch.FloatTensor, torch.DoubleTensor]
        if TEST_ON_GPU:
            dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

        for dtype in dtypes:
            tensor = torch.FloatTensor([DIM_SIZE]).fill_(1).mul_(rank)
            tensor = self.cast_and_place(tensor, dtype)
            window_name = "win_mutex_given_ranks_{}".format(dtype)
            bf.win_create(tensor, window_name)
            if rank == 0:
                with bf.win_mutex(window_name, for_self=True, ranks=[1]):
                    bf.barrier()
                    time.sleep(1.01)
            elif rank == 1:
                bf.barrier()
                t_start = time.time()
                with bf.win_mutex(window_name, ranks=[0]):
                    time.sleep(0.001)
                t_end = time.time()
                assert (t_end - t_start) > 1
            elif rank == 2:
                bf.barrier()
                t_start = time.time()
                with bf.win_mutex(window_name, ranks=[0]):
                    time.sleep(0.001)
                t_end = time.time()
                assert (t_end - t_start) < 0.1
            else:
                bf.barrier()
Example #13
                }
                self_weight = 1 / (1 + outdegree)

            bf.win_accumulate(x,
                              name="x",
                              self_weight=self_weight,
                              dst_weights=dst_weights,
                              require_mutex=True)
            bf.win_update_then_collect(name="x")
            associated_p = bf.win_associated_p(name="x")
            mse.append(
                torch.norm(x / associated_p - x_bar, p=2) /
                torch.norm(x_bar, p=2))

        # Do not forget to sync at the end!
        bf.barrier()
        bf.win_update_then_collect(name="x")
        associated_p = bf.win_associated_p(name="x")
        print(f"associated p at {bf.rank()} is {associated_p}")
        bf.turn_off_win_ops_with_associated_p()
        mse.append(
            torch.norm(x / associated_p - x_bar, p=2) / torch.norm(x_bar, p=2))
        bf.win_free(name="x")
    else:
        p = torch.DoubleTensor([1.0]).to(x.device)
        x_ext = torch.cat([x, p], 0)
        bf.win_create(x_ext, name="x_ext", zero_init=True)
        for i in range(args.max_iters):
            if args.enable_dynamic_topology:
                num_out_neighbors = len(bf.out_neighbor_ranks())
                sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors]
Example #14
def push_diging(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):

    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    # We let w = col{u, y, v}, i.e., u, y, v = w[:n], w[n:2*n], w[2n].
    # Instead of three directed_neighbor_allreduce operations for u, y,
    # and v respectively, we exploit one directed_neighbor_allreduce for
    # the combo vector w. This guarantees u, y, and v are transmitted
    # simultaneously and avoids any mismatch between them. Experiments
    # show that directed_neighbor_allreduce(w) is crucial for the
    # convergence of push_diging.
    w = torch.zeros(2 * n + 1, 1).to(torch.double)
    x = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)

    grad = x.grad.data.clone()
    w[n:2 * n] = grad
    x.grad.data.zero_()

    w[-1] = 1.0
    grad_prev = w[n:2 * n].clone()

    bf.win_create(w, name="w_buff", zero_init=True)

    mse = []
    for _ in range(maxite):
        bf.barrier()

        w[:n] = w[:n] - alpha * w[n:2 * n]
        bf.win_accumulate(w,
                          name="w_buff",
                          dst_weights={
                              rank: 1.0 / (outdegree * 2)
                              for rank in bf.out_neighbor_ranks()
                          },
                          require_mutex=True)
        w.div_(2)
        bf.barrier()

        w = bf.win_update_then_collect(name="w_buff")

        x.data = w[:n] / w[-1]
        loss_step(X, y, x, tensor_name='w_buff', loss=loss, rho=rho)
        grad = x.grad.data.clone()
        x.grad.data.zero_()

        w[n:2 * n] += grad - grad_prev
        grad_prev = grad
        if bf.rank() == 0:
            mse.append(torch.norm(x.data - w_opt, p=2))

    bf.barrier()
    w = bf.win_update_then_collect(name="w_buff")
    x.data = w[:n] / w[-1]

    return x, mse
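
A hypothetical driver for push_diging, for illustration only: it assumes the surrounding script has called bf.init(), defines the globals n and rank that the function body relies on, and provides the loss_step helper used above. X, y, and w_opt here are made-up synthetic data.

import torch
import bluefog.torch as bf

bf.init()
rank = bf.rank()                    # global used inside push_diging
n = 10                              # feature dimension, also a global above
m = 100                             # number of local samples (hypothetical)

X = torch.randn(m, n, dtype=torch.double)
w_true = torch.randn(n, 1, dtype=torch.double)
y = X.mm(w_true)
w_opt = w_true                      # reference point for the MSE curve

x_hat, mse = push_diging(X, y, w_opt, loss='linear_regression',
                         maxite=200, alpha=1e-2)
if bf.rank() == 0:
    print("final mse:", mse[-1].item())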