Example #1
def reduce():
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    nstreams = 1
    communicator = tcmm.Communicator(rank, size, nstreams)
    n_elements = 32 * 1024
    iterations = 100
    tensor = torch.rand(n_elements).cuda()
    if rank == 0:
        print('before rank: %d' % rank, time.time())
    for i in range(nstreams):
        communicator.reduce(tensor, 0)
    #communicator.allReduce(tensor)
    #hvd.allreduce(tensor)
    communicator.synchronize()
    start = time.time()
    previous = start
    for i in range(iterations):
        communicator.reduce(tensor, 0)
        #communicator.allReduce(tensor)
        #hvd.allreduce(tensor)
        current = time.time()
        if rank == 0:
            print('i: ', i, current - previous)
        previous = current
    communicator.synchronize()
    end = time.time()
    if rank == 0:
        print('after rank: %d' % rank, time.time(), (end - start) / iterations)
        print('throughput: ',
              n_elements * 4 * 1e-9 / ((end - start) / iterations), 'GB/s')
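
Like the rest of the snippets on this page, this benchmark assumes module-level imports of time, torch, tcmm, and horovod.torch as hvd, and that hvd.init() has run before reduce() is called. A minimal driver sketch under those assumptions (the script name in the comment is only a placeholder):

import time
import torch
import horovod.torch as hvd
import tcmm

if __name__ == '__main__':
    hvd.init()   # one process per GPU, e.g. launched via: horovodrun -np 4 python reduce_bench.py
    reduce()     # the benchmark defined above

Rank 0 then prints the per-iteration latency and the final throughput, computed as n_elements * 4 bytes per float32 element divided by the average iteration time.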
Example #2
    def __init__(self, symmetric=False, fp16=False):
        self.handles = []
        self.symmetric = symmetric
        self.fp16 = fp16  # doesn't support fp16 at the current stage
        self.merged_tensors = {}
        nstreams = 1
        self.merged_comm = tcmm.Communicator(hvd.rank(), hvd.size(), nstreams)
Example #3
def allreduce():
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    communicator = tcmm.Communicator(rank, size)
    tensor = torch.rand(2).cuda()
    print('before rank: %d' % rank, tensor)
    communicator.allReduce(tensor)
    print('after rank: %d' % rank, tensor)
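
If tcmm's allReduce performs an in-place sum (which the before/after prints suggest, but the snippet does not prove), one way to sanity-check it is against Horovod's own allreduce on a copy of the tensor. A sketch, assuming hvd.init() has already been called and a recent Horovod that exposes op=hvd.Sum:

reference = tensor.clone()                       # keep the pre-reduction values
communicator.allReduce(tensor)                   # in-place across all ranks (tcmm)
expected = hvd.allreduce(reference, op=hvd.Sum)  # Horovod reference sum
print('rank %d max diff:' % rank, (tensor - expected).abs().max().item())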
Example #4
    def __init__(self, tensor_names=None, prefix='flag', merge=False, single_layer=False, symmetric=False, fp16=False):
        self._tensor_names = tensor_names
        self.merge = merge
        self.single_layer = single_layer
        self.symmetric = symmetric
        self.prefix = prefix
        self.fp16 = fp16
        self.tensor_group_names = None
        if tensor_names is not None:
            self.init_tensor_group(tensor_names)
        nstreams = 1
        self.merged_comm = tcmm.Communicator(hvd.rank(), hvd.size(), nstreams)

        self._name_tensors = {}
        self.handles = []
Example #5
def benchmark_custom_comm():
    torch.cuda.set_device(hvd.local_rank())
    merged_comm = tcmm.Communicator(hvd.rank(), hvd.size(), 1)
    comm_op = merged_comm.reduce 
    sync_op = merged_comm.synchronize
    sizes = [2**i for i in range(10, 11)]
    #sizes = [] #[1024*i for i in range(1, 1024)] 
    large_sizes = [] #[1024*1024*i for i in range(1, 513)] # 1M to 512M
    sizes += large_sizes
    profiler = CommunicationProfiler(comm_op, sync_op, sizes)
    for root in range(hvd.size()):
        sizes, times = profiler.benchmark(root, num_iters=50)
        if hvd.rank() == 0:
            print('root: %d' % root)
            for s, t in zip(sizes, times):
                print(s, t, str(s*4/t*1e-6)+' MB/s')
            print()
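
The printed rate follows from s float32 elements of 4 bytes each: bytes moved divided by measured time, scaled by 1e-6 to MB/s. For example, s = 262144 elements (1 MiB of data) completing in t = 0.001 s gives 262144 * 4 / 0.001 * 1e-6 ≈ 1049 MB/s. Note that the only size generated here, 2**10 = 1024 elements (4 KB), is small enough that the measurement mostly reflects launch and latency overhead rather than peak bandwidth.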
Example #6
def multi_bcast():
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    communicator = tcmm.Communicator(rank, size)
    ntensors = 2
    tensors = []
    for i in range(ntensors):
        t = torch.rand(2).cuda()
        tensors.append(t)

    def _op(tensor):
        tensor.mul_(2)
        return None

    print('before rank: %d' % rank, tensors)
    communicator.multiBcast(tensors, _op)
    print('after rank: %d' % rank, tensors)
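
The callback handed to multiBcast mutates its tensor argument in place and returns None; whether tcmm invokes it before or after the broadcast is not visible from this snippet, so the before/after prints are the simplest way to check on a given build. Any callable with the same shape should work; a purely illustrative alternative:

def _zero_op(tensor):
    tensor.zero_()   # in-place, like _op above
    return None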
Example #7
def bench_customize_comm():
    import horovod.torch as hvd
    torch.random.manual_seed(10)
    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)

    logfile = './logs/resnet50-matrixsize-A.log'
    workloads = reader.read_tensor_sizes(logfile)
    tensors = []
    outputs = []
    for w in workloads:
        n = w[0]
        a = torch.rand(n).float().cuda()
        a = a.view(-1, a.size(-1))
        A = a.t() @ (a)
        tensors.append(A)
        outputs.append(A.new_zeros(A.shape))

    communicator = tcmm.Communicator(rank, size)
    warmup = 5
    niters = 10
    for i in range(warmup):
        communicator.multiBcast(tensors, outputs, compute_eigen)
        communicator.synchronize()
    torch.cuda.synchronize()

    stime = time.time()
    for i in range(niters):
        communicator.multiBcast(tensors, outputs, compute_eigen)
        communicator.synchronize()
        torch.cuda.synchronize()
    etime = time.time()
    print('Avg time: ', (etime - stime) / niters)
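
compute_eigen is defined elsewhere in the repository and is not shown here. Judging only from the call multiBcast(tensors, outputs, compute_eigen) and the preallocated same-shape output buffers, it appears to map each symmetric factor A = a^T a to a result written into the matching output. A purely illustrative stand-in under that assumption, assuming the callback is invoked once per (input, output) pair (the real callback may compute something different, e.g. the eigendecomposition used by K-FAC):

def compute_eigen(A, out):
    # symmetric eigendecomposition; A = a^T a is positive semi-definite
    eigvals, eigvecs = torch.linalg.eigh(A)
    # illustrative only: write a damped inverse back into the preallocated buffer
    out.copy_(eigvecs @ torch.diag(1.0 / (eigvals + 1e-10)) @ eigvecs.t())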
Example #8
    def __init__(self,
                 model,
                 lr=0.1,
                 hook_enabled=True,
                 factor_decay=0.95,
                 damping=0.001,
                 kl_clip=0.001,
                 fac_update_freq=10,
                 kfac_update_freq=100,
                 batch_averaged=True,
                 diag_blocks=1,
                 diag_warmup=0,
                 distribute_layer_factors=None,
                 sparse=False,
                 sparse_ratio=0.01,
                 exclude_parts=''):
        #exclude_parts='CommunicateInverse,ComputeInverse,CommunicateFactor,ComputeFactor'):

        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 < factor_decay <= 1:
            raise ValueError(
                "Invalid factor decay rate: {}".format(factor_decay))
        if not 0.0 < damping:
            raise ValueError("Invalid damping: {}".format(damping))
        if not 0.0 < kl_clip:
            raise ValueError("Invalid clipping value: {}".format(kl_clip))
        if not 0 < fac_update_freq:
            raise ValueError(
                "Invalid factor update frequency: {}".format(fac_update_freq))
        if not 0 < kfac_update_freq:
            raise ValueError(
                "Invalid K-FAC update frequency: {}".format(kfac_update_freq))
        if kfac_update_freq % fac_update_freq != 0:
            print(
                "WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq"
            )
        if not 0 < diag_blocks:
            raise ValueError(
                "Invalid diagonal block approx count: {}".format(diag_blocks))
        if not 0 <= diag_blocks:
            raise ValueError(
                "Invalid diagonal block approx count: {}".format(diag_blocks))
        if diag_blocks != 1:
            print(
                "WARNING: diag_blocks > 1 is experimental and may give poor results."
            )

        # For compatibility with `KFACParamScheduler`
        defaults = dict(lr=lr,
                        damping=damping,
                        fac_update_freq=fac_update_freq,
                        kfac_update_freq=kfac_update_freq)

        super(KFAC, self).__init__(model.parameters(), defaults)

        self.computeA = ComputeA()
        self.computeG = ComputeG()
        self.known_modules = {'Linear', 'Conv2d'}
        self.modules = []
        self.module_names = []
        # register hooks for known modules
        self.hook_enabled = hook_enabled
        self._register_modules(model)

        # tcmm communicator
        self.communicator = tcmm.Communicator(hvd.rank(), hvd.size(), 1)

        self.steps = 0

        # Dictionaries keyed by `module` for storing the factors and inverse factors
        self.m_a, self.m_g = {}, {}
        self.m_A, self.m_G = {}, {}
        self.m_inv_A, self.m_inv_G = {}, {}
        self.module_ranks = None

        self.sparse = sparse
        self.sparse_ratio = sparse_ratio
        self.residualsA, self.residualsG = {}, {}

        self.factor_decay = factor_decay
        self.kl_clip = kl_clip
        self.fac_update_freq = fac_update_freq
        self.kfac_update_freq = kfac_update_freq
        self.diag_blocks = diag_blocks
        self.diag_warmup = diag_warmup
        self.batch_averaged = batch_averaged

        self.exclude_communicate_inverse = 'CommunicateInverse' in exclude_parts
        self.exclude_compute_inverse = 'ComputeInverse' in exclude_parts
        self.exclude_communicate_factor = 'CommunicateFactor' in exclude_parts
        self.exclude_compute_factor = 'ComputeFactor' in exclude_parts

        # Compute ideal value for `distribute_layer_factors` based on
        # registered module count
        if distribute_layer_factors is None:
            self.distribute_layer_factors = hvd.size() > len(self.modules)
        else:
            self.distribute_layer_factors = distribute_layer_factors

        self.eps = 1e-10  # for numerical stability
        self.rank_iter = cycle(list(range(hvd.size())))
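
A minimal construction sketch, assuming Horovod is initialized and using a throwaway model (KFAC only hooks Linear and Conv2d modules, so the layers below are arbitrary; the class presumably also defines a step() method, not shown in this snippet):

import torch
import torch.nn as nn
import horovod.torch as hvd

hvd.init()
torch.cuda.set_device(hvd.local_rank())

model = nn.Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10)).cuda()
preconditioner = KFAC(model, lr=0.1, factor_decay=0.95, damping=0.001,
                      fac_update_freq=10, kfac_update_freq=100)

In the usual pattern for this kind of preconditioner, gradients are still produced by a standard optimizer such as SGD, while the K-FAC object applies its factor and inverse updates at the configured frequencies; passing exclude_parts (e.g. 'CommunicateInverse,ComputeInverse') flips the corresponding exclude_* flags set above.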