def _allreduce_factors(self):
    """Allreduce the factors for all layers"""
    handles = []

    for m in self.modules:
        handles.append(hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average))
        handles.append(hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average))

    for handle in handles:
        hvd.synchronize(handle)
def allreduce_async_(self, name, tensor, op=hvd.Average):
    self.op = op
    if self.merge:
        if self.symmetric:
            # Communicate only the upper triangle of the symmetric factor.
            upper_indices = torch.triu_indices(tensor.shape[0], tensor.shape[0],
                                               device=tensor.device)
            comm_tensor = tensor[upper_indices[0], upper_indices[1]]
        else:
            comm_tensor = tensor
        if self.fp16:
            # Compress to fp16; keep the quantization error as a residual that
            # is added back before the next compression.
            if self.residual:
                if name not in self._residuals:
                    self._residuals[name] = comm_tensor.new_zeros(comm_tensor.shape)
                comm_tensor.add_(self._residuals[name])
            half_tensor = comm_tensor.half()
            if self.residual:
                self._residuals[name] = comm_tensor - half_tensor
            comm_tensor = half_tensor
        self._name_tensors[name] = (tensor, comm_tensor)
        new_name, new_tensor = self._tensor_group.push_tensor(name, comm_tensor)
        if new_tensor is not None:
            # A merged buffer is full; reduce it in a single call.
            current_stream = torch.cuda.current_stream()
            current_stream.synchronize()
            handle = hvd.allreduce_async_(new_tensor, op=hvd.Sum,
                                          name=self.prefix + new_name)
            self.handles.append(handle)
    else:
        if self.symmetric:
            upper_indices = torch.triu_indices(tensor.shape[0], tensor.shape[0],
                                               device=tensor.device)
            comm_tensor = tensor[upper_indices[0], upper_indices[1]]
        else:
            comm_tensor = tensor
        if self.fp16:
            if self.residual:
                if name not in self._residuals:
                    self._residuals[name] = comm_tensor.new_zeros(comm_tensor.shape)
                comm_tensor.add_(self._residuals[name])
            half_tensor = comm_tensor.half()
            if self.residual:
                self._residuals[name] = comm_tensor - half_tensor
            comm_tensor = half_tensor
        self._name_tensors[name] = (tensor, comm_tensor)
        handle = hvd.allreduce_async_(comm_tensor, op=hvd.Sum)
        self.handles.append(handle)
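# A hedged, standalone sketch of the symmetric packing used above and the
# matching unpacking a receiver would need once the allreduce has completed.
# The helper names (pack_symmetric / unpack_symmetric) are illustrative and
# are not part of the original class.
import torch

def pack_symmetric(factor):
    # keep only the upper triangle of a symmetric (n, n) factor
    n = factor.shape[0]
    rows, cols = torch.triu_indices(n, n, device=factor.device)
    return factor[rows, cols]

def unpack_symmetric(packed, n):
    # rebuild the full symmetric matrix from its packed upper triangle
    rows, cols = torch.triu_indices(n, n, device=packed.device)
    full = packed.new_zeros(n, n)
    full[rows, cols] = packed
    full[cols, rows] = packed  # mirror into the lower triangle
    return full

a = torch.randn(4, 4)
a = a @ a.t()  # symmetric test factor
assert torch.allclose(unpack_symmetric(pack_symmetric(a), 4), a)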
def _reduce_factors(self, eigen_ranks):
    """Allreduce the factors for all layers"""
    handles = []

    for m in self.modules:
        # name and the assigned eigen-decomposition ranks are looked up here
        # but not used in this allreduce path
        name = self.module_name_map[m]
        ranks_a, ranks_g = eigen_ranks[m]
        rank_a = ranks_a[0]
        rank_g = ranks_g[0]

        handles.append(hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average))
        handles.append(hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average))

    for handle in handles:
        hvd.synchronize(handle)
def _allreduce_eigendecomp(self):
    """Allreduce the eigendecompositions for all layers

    Note: we use `op=hvd.Sum` to simulate an allgather. Each rank will either
    compute the eigendecomposition for a factor or just return zeros, so we
    sum instead of averaging.
    """
    handles = []

    for m in self.modules:
        handles.append(hvd.allreduce_async_(self.m_QA[m].data, op=hvd.Sum))
        handles.append(hvd.allreduce_async_(self.m_QG[m].data, op=hvd.Sum))

    for handle in handles:
        hvd.synchronize(handle)
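# A minimal, hedged sketch of the Sum-as-allgather trick described in the
# docstring above: each rank fills in only the entries it owns and leaves the
# rest at zero, so a single Sum allreduce leaves every rank with the complete
# result. The buffer size and ownership rule below are illustrative only.
import horovod.torch as hvd
import torch

hvd.init()
num_entries = 8
result = torch.zeros(num_entries)

for idx in range(num_entries):
    if idx % hvd.size() == hvd.rank():
        # stand-in for an expensive local computation (e.g. an eigendecomposition)
        result[idx] = float(idx) ** 2

hvd.allreduce_(result, op=hvd.Sum)  # every rank now holds all entries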
def delayedupdate(self, val):
    # Consume the allreduce launched on the previous call (if any) before
    # launching a new one, so communication overlaps with the work done
    # between calls.
    if self.handle is None:
        self.sum += 0
    else:
        self.sum += hvd.synchronize(self.handle)
    self.handle = hvd.allreduce_async_(val.detach().cpu(), name=self.name)
    self.n += 1
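# A hedged, self-contained illustration of the one-step-delayed pattern used
# by delayedupdate above: the handle from the previous step is synchronized
# only right before the next allreduce is launched, so the reduction overlaps
# with whatever runs between calls. All names and values here are illustrative.
import horovod.torch as hvd
import torch

hvd.init()
handle, total = None, 0.0

for step in range(5):
    local_val = torch.tensor([float(step)])      # stand-in for a per-step metric
    if handle is not None:
        total += hvd.synchronize(handle).item()  # result of the previous step
    handle = hvd.allreduce_async_(local_val, name='delayed_metric')

total += hvd.synchronize(handle).item()          # drain the last outstanding handle
print('sum of averaged metrics:', total)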
def async_send(self, tensors_compressed, name):
    handles = []
    for i, tensor_compressed in enumerate(tensors_compressed):
        handles.append(
            allreduce_async_(tensor_compressed, self.compressor.average, name + str(i)))
    return handles
def allreduce_parameters(params):
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run asynchronous allreduces.
    handles = []
    for name, p in params:
        handle = hvd.allreduce_async_(p, average=True, name=name)
        handles.append(handle)

    # Wait for completion.
    for handle in handles:
        hvd.synchronize(handle)
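# A small, hedged usage sketch for allreduce_parameters above: averaging a
# model's parameters across all Horovod workers, e.g. to periodically
# re-synchronize replicas. The model here is only a placeholder.
import horovod.torch as hvd
import torch

hvd.init()
model = torch.nn.Linear(10, 2)
allreduce_parameters(model.state_dict())  # every worker ends up with the averaged weights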
def maybe_allreduce_grads(model):
    if hvd.size() > 1:
        tstart_reduce = time.time()
        named_parameters = list(
            sorted(model.named_parameters(), key=lambda a: a[0]))
        grad_handles = []
        for name, p in named_parameters:
            if p.requires_grad:
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                with torch.no_grad():
                    grad_handles.append(hvd.allreduce_async_(p.grad, name=name))
        for handle in grad_handles:
            hvd.synchronize(handle)
        tlogger.record_tabular("TimeElapsedAllReduce", time.time() - tstart_reduce)
        if time.time() - tstart_reduce > 5:
            import socket
            tlogger.info(
                "Allreduce took more than 5 seconds for node {} (rank {})".format(
                    socket.gethostname(), hvd.rank()))
def get_eigen(model, inputs, targets, criterion, maxIter=50, tol=1e-3, comm=True):
    """Compute the top eigenvalue of the Hessian of the loss w.r.t. the model
    parameters, and the corresponding eigenvector, via power iteration.

    The model is switched to evaluation mode, otherwise the batch
    normalization layers would change. If you call this function during
    training, remember to switch the model back to training mode afterwards.
    """
    model.eval()

    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward(create_graph=True)

    params, gradsH = get_params_grad(model)
    v = [torch.randn(p.size()) for p in params]
    v = normalization(v)
    if comm:
        # make sure every rank starts the power iteration from the same vector
        hvd.broadcast_parameters(v, root_rank=0)

    eigenvalue = None
    for it in range(maxIter):
        print(it)
        model.zero_grad()
        Hv = hessian_vector_product(gradsH, params, v)
        if comm:
            # average the Hessian-vector products across ranks
            handles = []
            for i in range(len(Hv)):
                handles.append(
                    hvd.allreduce_async_(
                        Hv[i], name='reduce random vector update {}'.format(i)))
            for handle in handles:
                hvd.synchronize(handle)
        eigenvalue_tmp = group_product(Hv, v).item()
        v = normalization(Hv)
        if eigenvalue is None:
            eigenvalue = eigenvalue_tmp
        else:
            if abs(eigenvalue - eigenvalue_tmp) < tol:
                if comm:
                    return eigenvalue_tmp, v
            else:
                eigenvalue = eigenvalue_tmp

    if not comm:
        # no communication during the iterations: average the per-rank
        # estimates once at the end
        print("{} is here".format(hvd.rank()))
        eigenvalue = torch.FloatTensor([eigenvalue])
        hvd.allreduce_(eigenvalue, name='eigenvalue')
        print("allreduced eigs for rank {}".format(hvd.rank()))
        eigenvalue = float(eigenvalue)
        if hvd.rank() == 0:
            print("No Communication eigenvalue approximated at {}".format(eigenvalue))
    return eigenvalue, v
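# Hedged sketches of two helpers that get_eigen relies on but that are not
# shown here (group_product and normalization). These are the usual
# power-iteration utilities; the actual implementations may differ.
import torch

def group_product(xs, ys):
    # inner product of two parameter lists, treated as one flattened vector
    return sum((x * y).sum() for x, y in zip(xs, ys))

def normalization(vs):
    # rescale the list of tensors so the concatenated vector has unit L2 norm
    norm = torch.sqrt(group_product(vs, vs)) + 1e-6
    return [v / norm for v in vs]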
# Benchmark: measure allreduce bandwidth for float32 buffers from 4 KB to 512 MB.
data_size = []
bwd = []
handles = {}

for i in range(18):
    size = 1024 * (2**i)
    p_g = torch.randn(size).cuda()
    torch.cuda.synchronize()
    begin_time = time.time()

    rept = 10
    if i < 10:
        rept = 10000  # repeat small messages more often for a stable timing

    for j in range(rept):
        handles[j] = hvd.allreduce_async_(p_g, average=False)
    for j in range(rept):
        synchronize(handles[j])
    torch.cuda.synchronize()
    end_time = time.time()

    if hvd.local_rank() == 0:
        print(size * 4 / 1024, "KB")
        bandwidth = size * 4 * rept / (end_time - begin_time) / 1e9  # GB/s
        print('allreduce bandwidth, ', bandwidth)
        data_size.append(size)
        bwd.append(bandwidth)

print(data_size)
print(bwd)
exit(0)