    def _allreduce_factors(self):
        """Allreduce the factors for all layers"""
        handles = []

        for m in self.modules:
            handles.append(hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average))
            handles.append(hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average))

        for handle in handles:
            hvd.synchronize(handle)
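
A minimal, self-contained sketch of the same launch-then-synchronize pattern; the dummy factor tensors below stand in for the class state (self.m_A / self.m_G) above and are illustrative assumptions:

import torch
import horovod.torch as hvd

hvd.init()
# Dummy per-layer factors; in the class above these are self.m_A / self.m_G.
factors = [torch.randn(8, 8) for _ in range(3)]

# Launch all allreduces asynchronously, then block on the handles.
handles = [hvd.allreduce_async_(f, op=hvd.Average) for f in factors]
for handle in handles:
    hvd.synchronize(handle)  # each factor now holds the average across ranks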
Example #2
    def allreduce_async_(self, name, tensor, op=hvd.Average):
        """Launch an asynchronous allreduce for a single factor tensor.

        Depending on the configuration, the tensor is reduced to its upper
        triangle (symmetric factors), cast to fp16 with an error-feedback
        residual, and/or merged into a tensor group that is communicated
        once the group is complete.
        """
        self.op = op
        if self.merge:
            if self.symmetric:
                upper_indices = torch.triu_indices(tensor.shape[0],
                                                   tensor.shape[0],
                                                   device=tensor.device)
                comm_tensor = tensor[upper_indices[0], upper_indices[1]]
            else:
                comm_tensor = tensor
            if self.fp16:
                if self.residual:
                    if name not in self._residuals:
                        self._residuals[name] = comm_tensor.new_zeros(
                            comm_tensor.shape)
                    comm_tensor.add_(self._residuals[name])
                half_tensor = comm_tensor.half()
                if self.residual:
                    self._residuals[name] = comm_tensor - half_tensor
                comm_tensor = half_tensor
            self._name_tensors[name] = (tensor, comm_tensor)
            new_name, new_tensor = self._tensor_group.push_tensor(
                name, comm_tensor)
            if new_tensor is not None:
                current_stream = torch.cuda.current_stream()
                current_stream.synchronize()

                handle = hvd.allreduce_async_(new_tensor,
                                              op=hvd.Sum,
                                              name=self.prefix + new_name)
                self.handles.append(handle)
        else:
            if self.symmetric:
                upper_indices = torch.triu_indices(tensor.shape[0],
                                                   tensor.shape[0],
                                                   device=tensor.device)
                comm_tensor = tensor[upper_indices[0], upper_indices[1]]
            else:
                comm_tensor = tensor
            if self.fp16:
                if self.residual:
                    if name not in self._residuals:
                        self._residuals[name] = comm_tensor.new_zeros(
                            comm_tensor.shape)
                    comm_tensor.add_(self._residuals[name])
                half_tensor = comm_tensor.half()
                if self.residual:
                    self._residuals[name] = comm_tensor - half_tensor
                comm_tensor = half_tensor
            self._name_tensors[name] = (tensor, comm_tensor)
            handle = hvd.allreduce_async_(comm_tensor, op=hvd.Sum)
            self.handles.append(handle)
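
Since only the upper triangle is communicated when symmetric=True, the receiving side has to scatter the reduced values back into a full matrix. A hedged sketch of that round trip; the helper names are assumptions, not part of the original class:

import torch
import horovod.torch as hvd

def pack_upper(a):
    # Pack the upper triangle (including the diagonal) into a flat vector.
    idx = torch.triu_indices(a.shape[0], a.shape[0], device=a.device)
    return idx, a[idx[0], idx[1]]

def unpack_upper(idx, packed, n):
    # Scatter the flat vector back and mirror it into the lower triangle.
    full = torch.empty(n, n, dtype=packed.dtype, device=packed.device)
    full[idx[0], idx[1]] = packed
    full[idx[1], idx[0]] = packed
    return full

hvd.init()
a = torch.randn(4, 4)
a = a + a.t()                                   # a symmetric example factor
idx, comm = pack_upper(a)
handle = hvd.allreduce_async_(comm, op=hvd.Average, name='factor.upper')
comm = hvd.synchronize(handle)
a_avg = unpack_upper(idx, comm, a.shape[0])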
Example #3
    def _reduce_factors(self, eigen_ranks):
        """Allreduce the factors for all layers"""
        handles = []

        for m in self.modules:
            name = self.module_name_map[m]
            # The module name and owner ranks are looked up here but not yet
            # used; every factor is simply averaged across all workers below.
            ranks_a, ranks_g = eigen_ranks[m]
            rank_a = ranks_a[0]
            rank_g = ranks_g[0]
            handles.append(hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average))
            handles.append(hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average))

        for handle in handles:
            hvd.synchronize(handle)

    def _allreduce_eigendecomp(self):
        """Allreduce the eigendecompositions for all layers.

        Note: we use `op=hvd.Sum` to simulate an allgather. Each rank either
        computes the eigendecomposition for a factor or contributes zeros,
        so we sum instead of averaging.
        """
        handles = []

        for m in self.modules:
            handles.append(hvd.allreduce_async_(self.m_QA[m].data, op=hvd.Sum))
            handles.append(hvd.allreduce_async_(self.m_QG[m].data, op=hvd.Sum))

        for handle in handles:
            hvd.synchronize(handle)
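
The Sum-as-allgather trick described in the docstring above relies on only the owning rank holding nonzero data. A minimal sketch of that pattern, assuming a simple illustrative ownership rule of rank 0:

import torch
import horovod.torch as hvd

hvd.init()
Q = torch.zeros(4, 4)
if hvd.rank() == 0:
    # Only the owner fills in its (here trivial) eigendecomposition.
    Q.copy_(torch.linalg.eigh(torch.eye(4)).eigenvectors)
handle = hvd.allreduce_async_(Q, op=hvd.Sum)
hvd.synchronize(handle)
# Every rank now holds the owner's eigenvectors, as if they were allgathered.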
    def delayedupdate(self, val):
        # Fold in the result of the previously launched allreduce, if any,
        # then start a new one so communication overlaps with the next step.
        if self.handle is not None:
            self.sum += hvd.synchronize(self.handle)
        self.handle = hvd.allreduce_async_(val.detach().cpu(), name=self.name)
        self.n += 1
Example #6
    def async_send(self, tensors_compressed, name):
        handles = []
        for i, tensor_compressed in enumerate(tensors_compressed):
            handles.append(
                allreduce_async_(tensor_compressed, self.compressor.average,
                                 name + str(i)))
        return handles
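
A hedged sketch of the matching wait side for the handles returned above; the method name and the compressor's decompress signature are assumptions modeled on typical gradient-compression wrappers, and `synchronize` is assumed to be imported from horovod.torch alongside `allreduce_async_`:

    def wait_receive(self, handles, ctx):
        # Block on every outstanding allreduce and hand the reduced pieces
        # back to the compressor for decompression.
        tensors_reduced = [synchronize(handle) for handle in handles]
        return self.compressor.decompress(tensors_reduced, ctx)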
Example #7
def allreduce_parameters(params):
    """Allreduce (average) a collection of parameters in place."""
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run asynchronous allreduces.
    handles = []
    for name, p in params:
        handle = hvd.allreduce_async_(p, average=True, name=name)
        handles.append(handle)

    # Wait for completion.
    for handle in handles:
        hvd.synchronize(handle)
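
Example usage (hedged): averaging a model's full state dict across workers, e.g. to re-synchronize replicas periodically. The model below is just a stand-in:

import torch.nn as nn
import horovod.torch as hvd

hvd.init()
model = nn.Linear(16, 4)
# Averages every entry of the state dict in place across all ranks.
allreduce_parameters(model.state_dict())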
Example #8
def maybe_allreduce_grads(model):
    if hvd.size() > 1:
        tstart_reduce = time.time()
        named_parameters = sorted(model.named_parameters(), key=lambda a: a[0])
        grad_handles = []
        for name, p in named_parameters:
            if p.requires_grad:
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                with torch.no_grad():
                    grad_handles.append(hvd.allreduce_async_(p.grad,
                                                             name=name))
        for handle in grad_handles:
            hvd.synchronize(handle)
        tlogger.record_tabular("TimeElapsedAllReduce",
                               time.time() - tstart_reduce)
        if time.time() - tstart_reduce > 5:
            import socket
            tlogger.info(
                "Allreduce took more than 5 seconds for node {} (rank {})".
                format(socket.gethostname(), hvd.rank()))
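
A hedged usage sketch: averaging gradients by hand after backward() instead of wrapping the optimizer with hvd.DistributedOptimizer. The model, data, and optimizer below are illustrative assumptions:

import torch
import torch.nn as nn
import horovod.torch as hvd

hvd.init()
model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

x, y = torch.randn(32, 8), torch.randn(32, 1)
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
maybe_allreduce_grads(model)   # no-op when hvd.size() == 1
optimizer.step()
optimizer.zero_grad()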
Example #9
def get_eigen(model,
              inputs,
              targets,
              criterion,
              maxIter=50,
              tol=1e-3,
              comm=True):
    """
    compute the top eigenvalues of model parameters and
    the corresponding eigenvectors.

    change the model to evaluation mode, otherwise the batch Normalization Layer will change.
    If you call this functino during training, remember to change the mode back to training mode.
    model.eval()
    """

    model.eval()

    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward(create_graph=True)

    params, gradsH = get_params_grad(model)
    v = [torch.randn_like(p) for p in params]
    v = normalization(v)
    if comm:
        hvd.broadcast_parameters(v, root_rank=0)

    eigenvalue = None

    for i in range(maxIter):
        print(i)
        model.zero_grad()
        Hv = hessian_vector_product(gradsH, params, v)
        if comm:
            handles = []
            for j in range(len(Hv)):
                handles.append(
                    hvd.allreduce_async_(
                        Hv[j],
                        name='reduce random vector update {}'.format(j)))
            for handle in handles:
                hvd.synchronize(handle)
        eigenvalue_tmp = group_product(Hv, v).item()
        v = normalization(Hv)
        if eigenvalue is None:
            eigenvalue = eigenvalue_tmp
        else:
            if abs(eigenvalue - eigenvalue_tmp) < tol:
                if comm:
                    return eigenvalue_tmp, v
            else:
                eigenvalue = eigenvalue_tmp
    if not comm:
        print("{} is here".format(hvd.rank()))
        eigenvalue = torch.FloatTensor([eigenvalue])
        hvd.allreduce_(eigenvalue, name='eigenvalue')
        print("allreduced eigs for rank {}".format(hvd.rank()))
        eigenvalue = float(eigenvalue)
        if hvd.rank() == 0:
            print("No Communication eigenvalue approximated at {}".format(
                eigenvalue))
    return eigenvalue, v
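
The snippet above relies on a few helpers that are not shown. These are hedged sketches following the usual power-iteration conventions, not the original implementations:

import torch

def get_params_grad(model):
    # Trainable parameters and their gradients; assumes
    # loss.backward(create_graph=True) has already been called.
    params = [p for p in model.parameters() if p.requires_grad]
    grads = [p.grad for p in params]
    return params, grads

def group_product(xs, ys):
    # Inner product summed over a list of parameter tensors.
    return sum((x * y).sum() for x, y in zip(xs, ys))

def normalization(vs):
    # Normalize the stacked vector v to unit length.
    norm = torch.sqrt(group_product(vs, vs)) + 1e-6
    return [v / norm for v in vs]

def hessian_vector_product(gradsH, params, v):
    # H v via a second backward pass through the gradient graph
    # (requires the graph kept alive by create_graph=True).
    return torch.autograd.grad(gradsH, params, grad_outputs=v,
                               retain_graph=True)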
Example #10
    # Earlier runs benchmarked conv-shaped buffers such as 256x256x3x3.
    data_size = []
    bwd = []

    handles = {}
    for i in range(18):
        size = 1024 * (2**i)
        p_g = torch.randn(size).cuda()
        torch.cuda.synchronize()
        begin_time = time.time()
        rept = 10
        if i < 10:
            rept = 10000
        for r in range(rept):
            handles[r] = hvd.allreduce_async_(p_g, average=False)
        for r in range(rept):
            hvd.synchronize(handles[r])
        torch.cuda.synchronize()
        end_time = time.time()
        if hvd.local_rank() == 0:
            print(size * 4 / 1024, "KB")
            #print('allreduce time, ', (end_time - begin_time) / rept)
            bandwidth = size * 4 * rept / (end_time - begin_time) / 1e9
            print('allreduce bandwidth, ', bandwidth)
            data_size.append(size)
            bwd.append(bandwidth)
    print(data_size)
    print(bwd)
    exit(0)