Example #1
def test_2_002(argv):
    nk = pynccl.Nccl()
    nc = nk._nccl  # cuNccl
    api = nc._api  # libnccl

    nuid = pynccl.binding.ncclUniqueId()
    print(nuid)
    print(type(nuid.internal))

    #nuid.internal = pynccl.binding.NcclUniqueId_data_t()
    #nuid.internal = str(create_string_buffer('', 128))
    ##nuid.internal = create_string_buffer('', 128)

    '''
    intnl_str = create_string_buffer('', 128)
    print(intnl_str)

    for i in range(128):
        intnl_str[i] = chr(0)
    for i in range(12):
        intnl_str[i] = 'a'

    intnl_char_p = cast(intnl_str, c_char_p)
    #intnl_char_p = cast(intnl_str, pynccl.binding.NcclUniqueId_data_t)
    nuid.internal = intnl_char_p
    '''

    #for i in range(128):
    #    nuid.internal[i] = 44


    # socket.AF_INET = 2  # 0x0002  ==>  chr(0) + chr(2)
    #nuid.internal = chr(0) + chr(2) + 'nccl-%d-%d' % (os.getpid(), 0)  # TODO: global counter

    intnl_buf = chr(0) + chr(2) + 'nccl-%d-%d' % (os.getpid(), 0)
    intnl_buf += chr(0) * (pynccl.binding.NCCL_UNIQUE_ID_BYTES - len(intnl_buf))
    nuid.internal = intnl_buf

    print(nuid.internal)
    print(len(nuid.internal))
    #print(dir(nuid.internal))

    # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    r = api.ncclGetUniqueId(byref(nuid))  # overwrites whatever was manually written into nuid.internal above
    print('>>> ncclGetUniqueId ', r)
    #print(nuid)
    '''
    print(id(nuid.internal))
    print(type(nuid.internal))
    #print(len(nuid.internal))
    print(nuid.internal)
    '''

    comm = c_void_p(0)
    nRanks = int(argv[1])  #2
    myRank = int(argv[2])  #0

    r = api.ncclCommInitRank(byref(comm), nRanks, nuid, myRank)
    #x#r = api.ncclCommInitRank(byref(comm), nRanks, byref(nuid), myRank)
    print('>>> ncclCommInitRank ', r)
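For context, test_2_002 reads the total rank count and this process's rank from the command line (argv[1] and argv[2]), and ncclCommInitRank only returns once all nRanks participants have called it with the same unique id. A minimal launcher sketch, assuming the test lives in a script named test_nccl.py (a hypothetical name), one OS process per rank:

import subprocess
import sys

def launch_test_2_002(n_ranks=2, script='test_nccl.py'):
    # start one process per rank: python test_nccl.py <nRanks> <myRank>
    procs = [subprocess.Popen([sys.executable, script, str(n_ranks), str(rank)])
             for rank in range(n_ranks)]
    for p in procs:
        p.wait()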
Example #2
def test_2_001():
    nk = pynccl.Nccl()
    nc = nk._nccl  # cuNccl
    api = nc._api  # libnccl


    comms_a = c_void_p * 4
    int_a = c_int * 4

    # <1>
    comms = comms_a(0, 0, 0, 0)
    comms_p = cast(comms, POINTER(c_void_p))

    # <2>
    #comms = c_void_p(0)

    devs = int_a(0, 1, 2, 3)
    # <3>
    #devs_p = byref(devs)
    # <4>
    devs_p = cast(devs, POINTER(c_int))

    # <1>
    r = api.ncclCommInitAll(comms_p, 4, devs_p)
    # <2>
    #r = api.ncclCommInitAll(byref(comms), 4, devs_p)

    print(r)

    for i in range(4):
        print(comms[i])
        r = api.ncclCommDestroy(comms[i])
        print(r)
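The ctypes plumbing above hard-codes four devices. A small helper sketch (not part of pynccl; purely illustrative) that builds the same arguments for an arbitrary device list using standard ctypes:

from ctypes import POINTER, c_int, c_void_p, cast

def make_comm_init_all_args(dev_ids):
    # one communicator slot per device, plus the two pointers ncclCommInitAll expects
    n = len(dev_ids)
    comms = (c_void_p * n)()                 # zero-initialised communicator handles
    devs = (c_int * n)(*dev_ids)             # CUDA device ordinals
    return comms, cast(comms, POINTER(c_void_p)), cast(devs, POINTER(c_int))

# usage, equivalent to the hard-coded arrays in test_2_001:
#   comms, comms_p, devs_p = make_comm_init_all_args([0, 1, 2, 3])
#   r = api.ncclCommInitAll(comms_p, len(comms), devs_p)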
Example #3
def test_2_003():
    nk = pynccl.Nccl()
    nc = nk._nccl  # cuNccl
    api = nc._api  # libnccl

    # -------------------------------------

    procs = []

    # -------------------------------------

    comms_a = c_void_p * 4
    int_a = c_int * 4

    # <1>
    comms = comms_a(0, 0, 0, 0)
    comms_p = cast(comms, POINTER(c_void_p))

    # <2>
    #comms = c_void_p(0)

    devs = int_a(0, 1, 2, 3)
    # <3>
    #devs_p = byref(devs)
    # <4>
    devs_p = cast(devs, POINTER(c_int))

    # <1>
    r = api.ncclCommInitAll(comms_p, 4, devs_p)
    # <2>
    #r = api.ncclCommInitAll(byref(comms), 4, devs_p)

    print('>>> ncclCommInitAll ', r)

    #x#cuda.close()

    # -------------------------------------

    for i in range(4):
        worker = mp.Process(
            target=gpu_worker_proc,
            args=(api, i, i, comms[i]))
        worker.daemon = True
        worker.start()
        procs.append(worker)


    for worker in procs:
        worker.join()

    # -------------------------------------

    for i in range(4):
        #print(comms[i])
        r = api.ncclCommDestroy(comms[i])
        print('>>> ncclCommDestroy ', r)
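The worker target gpu_worker_proc is not reproduced in this listing. A hypothetical stub, consistent only with the arguments passed above (api, rank, gpu_i, comms[i]) and with the pattern the other workers follow:

def gpu_worker_proc(api, rank, gpu_i, comm):
    # bind this process to its GPU before any other CUDA work, then issue
    # collectives on the communicator it was handed (body omitted; this is a
    # sketch, not the original helper)
    cuda.select_device(gpu_i)
    stream = cuda.stream()
    # ... allocate device buffers and call api.ncclAllReduce(..., comm, stream.handle)
    stream.synchronize()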
Example #4
def gpu_worker_proc_41_2(api, kn, rank, gpu_i, q):

    # NOTE: select the GPU first, before any other CUDA work in this process
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()
    #nc = nk._nccl  # cuNccl
    #api = nc._api  # libnccl

    pg0 = list(range(kn))
    comm_0 = cre_nccl_comm_fn(nk, q, rank, pg0)

    pg1 = list(range(kn))[1:]
    comm_1 = cre_nccl_comm_fn(nk, q, rank, pg1)

    pg2 = list(range(kn))[2:]
    comm_2 = cre_nccl_comm_fn(nk, q, rank, pg2)

    time.sleep(2)

    nccl_fn_on_comm(nk, comm_0, q, rank, pg0, gpu_i)
    time.sleep(1)

    nccl_fn_on_comm(nk, comm_1, q, rank, pg1, gpu_i)
    time.sleep(1)

    nccl_fn_on_comm(nk, comm_2, q, rank, pg2, gpu_i)
    time.sleep(1)

    r = nk.comm_destroy(comm_0)
    print(rank, '>>> ncclCommDestroy ', r)

    r = nk.comm_destroy(comm_1)
    print(rank, '>>> ncclCommDestroy ', r)

    r = nk.comm_destroy(comm_2)
    print(rank, '>>> ncclCommDestroy ', r)
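cre_nccl_comm_fn and nccl_fn_on_comm are helpers defined elsewhere in the test module and are not shown here. A hypothetical launcher for this worker (the name test_2_041 is invented), mirroring the structure of test_2_004 below:

def test_2_041(kn=4):
    q = mp.Queue()
    procs = []
    for i in range(kn):
        # api is unused inside gpu_worker_proc_41_2 (it builds its own pynccl.Nccl()),
        # so None is passed in its place
        worker = mp.Process(target=gpu_worker_proc_41_2,
                            args=(None, kn, i, i, q))
        worker.daemon = True
        worker.start()
        procs.append(worker)
    for worker in procs:
        worker.join()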
Example #5
def test_2_004():
    nk = pynccl.Nccl()
    nc = nk._nccl  # cuNccl
    api = nc._api  # libnccl

    # -------------------------------------

    procs = []

    q = mp.Queue()  # use the module-level factory; mp.queues.Queue needs an explicit context

    # -------------------------------------

    for i in range(4):
        worker = mp.Process(
            target=gpu_worker_proc_4,
            args=(api, i, i, q))
        worker.daemon = True
        worker.start()
        procs.append(worker)


    for worker in procs:
        worker.join()
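When the default fork start method is in use, the workers inherit whatever CUDA/NCCL state the parent has already created. A variant sketch (an alternative, not what the listing does) uses a spawn context so each worker starts from a clean interpreter; the queue must then come from the same context:

import multiprocessing as mp

def test_2_004_spawn():
    ctx = mp.get_context('spawn')
    q = ctx.Queue()
    procs = []
    for i in range(4):
        # gpu_worker_proc_4 rebuilds its own api object, so None is picklable and sufficient
        worker = ctx.Process(target=gpu_worker_proc_4, args=(None, i, i, q))
        worker.daemon = True
        worker.start()
        procs.append(worker)
    for worker in procs:
        worker.join()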
Example #6
def gpu_worker_proc_5_2(api, kn, rank, gpu_i, q):

    # NOTE: select the GPU first, before any other CUDA work in this process
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()
    #nc = nk._nccl  # cuNccl
    #api = nc._api  # libnccl

    if rank == 0:

        nuid = nk.get_unique_id()

        #w = mxutils.mx.nd.array(np.random.random((kn, 5, 10)), dtype=np.float32)  # w
        w = mxutils.mx.nd.array(np.random.random((5, kn * 10)),
                                dtype=np.float32)  # w
        print('w', w)

        for j in range(kn - 1):
            q.put((nuid, w))

    else:
        nuid, w = q.get()
    # -------------------------------------

    x = mxutils.mx.nd.array(np.random.random((7, 5)), dtype=np.float32)

    #arr_send = mxutils.mx.nd.array(np.random.random(5, 40), dtype=np.float32)  # w
    #arr_send = w[rank]
    arr_send = w[:, rank * 10:(rank + 1) * 10]

    #arr_recv = arr_send.zeros_like()
    #arr_recv = mxutils.mx.nd.zeros((kn, 5, 10), dtype=np.float32)  # recv
    #arr_recv = mxutils.mx.nd.zeros((5, kn * 10), dtype=np.float32)  # recv
    arr_recv = mxutils.mx.nd.zeros((kn * 10, 5), dtype=np.float32)  # recv.T

    #arr_send[1][1] = random.random()
    print(arr_send[1][1])

    #x#sz = 32 * 1000 * 10000
    sz = arr_send.size

    d_arr_send = arr_send.as_in_context(mxutils.context.gpu(gpu_i))
    d_arr_recv = arr_recv.as_in_context(mxutils.context.gpu(gpu_i))

    comm_i = nk.get_comm()
    nRanks = int(kn)  #2
    myRank = int(rank)  #0
    r = nk.comm_init_rank(byref(comm_i), nRanks, nuid, myRank)
    print('>>> ncclCommInitRank ', r)

    stream_i = nk.get_stream()

    # for testing: rank 0's sleep blocks the other ranks' allreduce
    if rank == 0:
        print('-x' * 40, rank)
        time.sleep(10)
        print('=x' * 40, rank)

    r = nk.group_start()
    print('>>> ncclGroupStart ', r)

    ############
    t_arr_send = d_arr_send.T

    #p_arr_send = d_arr_send.get_data_p()
    p_arr_send = t_arr_send.get_data_p()
    p_arr_recv = d_arr_recv.get_data_p()
    '''
    r = nk.all_reduce(p_arr_send, p_arr_recv,
                      sz,
                      pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                      comm_i, stream_i.handle)  # NOTE:
                      #comm_i, c_void_p(0))  # NOTE:
    print('>>> ncclAllReduce ', rank, r)
    '''

    r = nk.all_gather(p_arr_send, p_arr_recv, sz, pynccl.binding.ncclFloat,
                      comm_i, stream_i.handle)
    print('>>> ncclAllGather ', r)

    r = nk.group_end()
    print('>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    mxutils.mx.ndarray.ndarray.waitall()  ###

    r_arr = d_arr_recv.asnumpy()
    #print(r_arr.T)  # the result of interest is r_arr.T (it should equal w)
    print(rank, w.asnumpy() == r_arr.T)

    r = nk.comm_destroy(comm_i)
    print('>>> ncclCommDestroy ', r)
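The final comparison against r_arr.T works because ncclAllGather concatenates each rank's contiguous send buffer in rank order: rank r sends the transpose of its 10-column slice of w, and stacking those (10, 5) blocks reproduces w.T. A NumPy-only sketch of that layout argument, using the same shapes:

import numpy as np

kn = 4
w = np.random.random((5, kn * 10)).astype(np.float32)
chunks = [w[:, r * 10:(r + 1) * 10].T for r in range(kn)]  # what each rank sends
gathered = np.concatenate(chunks, axis=0)                  # rank-ordered concatenation
assert np.array_equal(gathered, w.T)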
Example #7
def gpu_worker_proc_5(api, kn, rank, gpu_i, q):

    # NOTE: select the GPU first, before any other CUDA work in this process
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()
    nc = nk._nccl  # cuNccl
    api = nc._api  # libnccl

    if rank == 0:

        nuid = pynccl.binding.ncclUniqueId()

        intnl_buf = chr(0) + chr(2) + 'nccl-%d-%d' % (os.getpid(), 0)
        intnl_buf += chr(0) * (pynccl.binding.NCCL_UNIQUE_ID_BYTES -
                               len(intnl_buf))
        nuid.internal = intnl_buf

        r = api.ncclGetUniqueId(byref(nuid))  # TODO:
        print('>>> ncclGetUniqueId ', r)

        for j in range(kn - 1):
            q.put(nuid)

    else:
        nuid = q.get()
    # -------------------------------------

    #arr_send = np.array(np.random.random((1000, 10000)), dtype=np.float32)
    #arr_recv = np.empty_like(arr_send)
    #print(arr_send[1][1])
    arr_send = mxutils.mx.nd.zeros((1000, 10000), dtype=np.float32)
    arr_recv = arr_send.zeros_like()
    arr_send[1][1] = random.random()
    print(arr_send[1][1])

    #x#sz = 32 * 1000 * 10000
    sz = arr_send.size

    ####cuda.select_device(gpu_i)

    #d_arr_send = cuda.to_device(arr_send)
    #d_arr_recv = cuda.to_device(arr_recv)
    d_arr_send = arr_send.as_in_context(mxutils.context.gpu(gpu_i))
    d_arr_recv = arr_recv.as_in_context(mxutils.context.gpu(gpu_i))

    comm_i = c_void_p(0)
    nRanks = int(kn)  #2
    myRank = int(rank)  #0
    r = api.ncclCommInitRank(byref(comm_i), nRanks, nuid, myRank)
    #x#r = api.ncclCommInitRank(byref(comm_i), nRanks, byref(nuid), myRank)
    print('>>> ncclCommInitRank ', r)

    stream_i = cuda.stream()

    r = api.ncclGroupStart()
    print('>>> ncclGroupStart ', r)

    #ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
    #    comms[i], s[i])

    #x#p_arr_send = cast(d_arr_send.device_ctypes_pointer, c_void_p)
    #x#p_arr_recv = cast(d_arr_recv.device_ctypes_pointer, c_void_p)

    #p_arr_send = c_void_p(d_arr_send.device_ctypes_pointer.value)  # NOTE:
    #p_arr_recv = c_void_p(d_arr_recv.device_ctypes_pointer.value)
    p_arr_send = d_arr_send.get_data_p()
    p_arr_recv = d_arr_recv.get_data_p()

    #pd_arr_send = cuda_driver.device_pointer(p_arr_send)  # int
    #pd_arr_recv = cuda_driver.device_pointer(d_arr_recv)  # int

    #r = api.ncclAllReduce(d_arr_send.device_ctypes_pointer, d_arr_recv.device_ctypes_pointer,
    r = api.ncclAllReduce(p_arr_send, p_arr_recv, sz, pynccl.binding.ncclFloat,
                          pynccl.binding.ncclSum, comm_i,
                          stream_i.handle)  # NOTE:
    #comm_i, c_void_p(0))  # NOTE:
    print('>>> ncclAllReduce ', r)

    r = api.ncclGroupEnd()
    print('>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    r_arr = d_arr_recv.asnumpy()
    #print(r_arr)
    print(r_arr[1][1])

    #cuda.close()

    r = api.ncclCommDestroy(comm_i)
    print('>>> ncclCommDestroy ', r)
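Here arr_send is all zeros except element [1][1], so after the sum AllReduce every element of r_arr stays zero except r_arr[1][1], which becomes the sum of the kn ranks' random scalars. A CPU-only sketch of that expectation (kn=4 assumed):

import numpy as np

kn = 4
vals = np.random.random(kn).astype(np.float32)            # one random scalar per rank
bufs = [np.zeros((1000, 10000), dtype=np.float32) for _ in range(kn)]
for v, b in zip(vals, bufs):
    b[1][1] = v
reduced = np.sum(bufs, axis=0)                            # elementwise, like ncclSum
assert abs(reduced[1][1] - vals.sum()) < 1e-5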
Example #8
def gpu_worker_proc_5_3(api, kn, rank, gpu_i, q):

    # NOTE: select the GPU first, before any other CUDA work in this process
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()
    #nc = nk._nccl  # cuNccl
    #api = nc._api  # libnccl

    if rank == 0:

        nuid0 = nk.get_unique_id()

        for j in range(kn - 1):
            q.put(nuid0)

    else:
        nuid0 = q.get()
    # -------------------------------------

    time.sleep(10)
    # -------------------------------------

    if rank == 1:

        nuid1 = nk.get_unique_id()

        for j in range(kn - 1):
            q.put(nuid1)

    else:
        if rank % 2 == 1:
            nuid1 = q.get()

    # -------------------------------------

    arr_send = mxutils.mx.nd.zeros((1000, 10000), dtype=np.float32)
    arr_recv = arr_send.zeros_like()
    arr_send[1][1] = random.random()
    print(arr_send[1][1])

    #x#sz = 32 * 1000 * 10000
    sz = arr_send.size

    d_arr_send = arr_send.as_in_context(mxutils.context.gpu(gpu_i))
    d_arr_recv = arr_recv.as_in_context(mxutils.context.gpu(gpu_i))

    # -------------------------------------

    comm_i0 = nk.get_comm()
    nRanks = int(kn)  #2
    myRank = int(rank)  #0
    r = nk.comm_init_rank(byref(comm_i0), nRanks, nuid0, myRank)
    print('>>> ncclCommInitRank ', r)

    stream_i0 = nk.get_stream()

    # for testing: rank 0's sleep blocks the other ranks' allreduce
    if rank == 0:
        print('-x' * 40, rank)
        time.sleep(10)
        print('=x' * 40, rank)

    r = nk.group_start()
    print('>>> ncclGroupStart ', r)

    p_arr_send = d_arr_send.get_data_p()
    p_arr_recv = d_arr_recv.get_data_p()

    r = nk.all_reduce(p_arr_send, p_arr_recv, sz, pynccl.binding.ncclFloat,
                      pynccl.binding.ncclSum, comm_i0,
                      stream_i0.handle)  # NOTE:
    #comm_i, c_void_p(0))  # NOTE:
    print('>>> ncclAllReduce ', rank, r)

    r = nk.group_end()
    print('>>> ncclGroupEnd ', r)

    stream_i0.synchronize()

    r_arr = d_arr_recv.asnumpy()
    #print(r_arr)
    print(r_arr[1][1])

    #cuda.close()

    # -------------------------------------

    if rank % 2 == 1:
        comm_i1 = nk.get_comm()
        nRanks = int(kn / 2)  # only the odd global ranks join this communicator
        myRank = int(rank / 2)  # renumber global odd ranks 1, 3, 5, ... as 0, 1, 2, ...
        r = nk.comm_init_rank(byref(comm_i1), nRanks, nuid1, myRank)
        print('>>> 1 ncclCommInitRank ', r)

        stream_i1 = nk.get_stream()

        # for testing: rank 0's sleep would block the other ranks' allreduce
        # (unreachable here: only odd ranks enter this branch)
        if rank == 0:
            print('-x' * 40, rank)
            time.sleep(10)
            print('=x' * 40, rank)

        r = nk.group_start()
        print('>>> 1 ncclGroupStart ', r)

        p_arr_send = d_arr_send.get_data_p()
        p_arr_recv = d_arr_recv.get_data_p()

        r = nk.all_reduce(p_arr_send, p_arr_recv, sz, pynccl.binding.ncclFloat,
                          pynccl.binding.ncclSum, comm_i1,
                          stream_i1.handle)  # NOTE:
        #comm_i, c_void_p(0))  # NOTE:
        print('>>> 1 ncclAllReduce ', rank, r)

        r = nk.group_end()
        print('>>> 1 ncclGroupEnd ', r)

        stream_i1.synchronize()

        r_arr = d_arr_recv.asnumpy()
        #print(r_arr)
        print(r_arr[1][1])

        #cuda.close()

    else:
        time.sleep(15)

    # -------------------------------------

    r = nk.comm_destroy(comm_i0)
    if rank % 2 == 1:
        r = nk.comm_destroy(comm_i1)
    print('>>> ncclCommDestroy ', r)
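The second communicator spans only the odd global ranks, so they must be renumbered densely before calling ncclCommInitRank; int(rank / 2) is that renumbering. A quick check of the mapping (eight global ranks used purely for illustration):

# global odd ranks 1, 3, 5, 7 become subgroup ranks 0, 1, 2, 3
assert [int(r / 2) for r in range(1, 8, 2)] == [0, 1, 2, 3]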
Example #9
def gpu_worker_proc_4(api, rank, gpu_i, q):
    nk = pynccl.Nccl()
    nc = nk._nccl  # cuNccl
    api = nc._api  # libnccl

    if rank == 0:
        comms_a = c_void_p * 4
        int_a = c_int * 4

        # <1>
        comms = comms_a(0, 0, 0, 0)
        comms_p = cast(comms, POINTER(c_void_p))

        # <2>
        #comms = c_void_p(0)

        devs = int_a(0, 1, 2, 3)
        # <3>
        #devs_p = byref(devs)
        # <4>
        devs_p = cast(devs, POINTER(c_int))

        # <1>
        r = api.ncclCommInitAll(comms_p, 4, devs_p)
        # <2>
        #r = api.ncclCommInitAll(byref(comms), 4, devs_p)

        print('>>> ncclCommInitAll ', r)

        #x#cuda.close()
        q.put(comms)

    else:
        comms = q.get()
    # -------------------------------------

    arr_send = np.array(np.random.random((1000, 10000)), dtype=np.float32)
    arr_recv = np.empty_like(arr_send)
    print(arr_send[1][1])

    sz = arr_send.size  # NCCL counts elements, not bits: 1000 * 10000

    cuda.select_device(gpu_i)

    d_arr_send = cuda.to_device(arr_send)
    d_arr_recv = cuda.to_device(arr_recv)

    comm_i = comms[rank]  # pick this rank's communicator
    stream_i = cuda.stream()

    r = api.ncclGroupStart()
    print('>>> ncclGroupStart ', r)

    #ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
    #    comms[i], s[i])

    r = api.ncclAllReduce(d_arr_send.device_ctypes_pointer, d_arr_recv.device_ctypes_pointer,
                          sz, pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                          comm_i, stream_i.handle)  # pass the raw CUDA stream handle, as in the other workers
    print('>>> ncclAllReduce ', r)

    r = api.ncclGroupEnd()
    print('>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    r_arr = d_arr_recv.copy_to_host()
    #print(r_arr)
    print(r_arr[1][1])

    #cuda.close()

    # -------------------------------------
    if rank == 0:
        for i in range(4):
            #print(comms[i])
            r = api.ncclCommDestroy(comms[i])
            print('>>> ncclCommDestroy ', r)
Example #10
def test_1_001():
    nk = pynccl.Nccl()
    print(dir(nk))
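test_1_001 only introspects the wrapper; the attributes it is looking for are the high-level methods the other examples actually call. A small sketch (the name test_1_002 is invented) that filters dir(nk) down to those names, taken from their uses elsewhere in this listing:

used_in_these_examples = [
    'get_unique_id', 'get_comm', 'get_stream', 'comm_init_rank',
    'group_start', 'group_end', 'all_reduce', 'all_gather', 'comm_destroy',
]

def test_1_002():
    nk = pynccl.Nccl()
    print([name for name in used_in_these_examples if hasattr(nk, name)])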
Example #11
def gpu_worker_proc_5_2(api, kn, rank, gpu_i, q):

    # NOTE: select the GPU first, before any other CUDA work in this process
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()
    #nc = nk._nccl  # cuNccl
    #api = nc._api  # libnccl

    if rank == 0:

        nuid = nk.get_unique_id()

        w = torch.Tensor(np.random.random((kn * 10, 5)))  # w
        print('w', w)

        for j in range(kn - 1):
            q.put((nuid, w))

    else:
        nuid, w = q.get()
    # -------------------------------------

    #arr_send = w[rank]
    arr_send = w[rank * 10:(rank + 1) * 10, :]

    arr_recv = torch.zeros((kn * 10, 5))  # recv

    #arr_send[1][1] = random.random()
    print(arr_send[1][1])

    #x#sz = arr_send.size
    sz = np.prod(arr_send.size())  #* arr_send.element_size()

    d_arr_send = arr_send.cuda(gpu_i)
    d_arr_recv = arr_recv.cuda(gpu_i)

    comm_i = nk.get_comm()
    nRanks = int(kn)  #2
    myRank = int(rank)  #0
    r = nk.comm_init_rank(byref(comm_i), nRanks, nuid, myRank)
    print(rank, '>>> ncclCommInitRank ', r)

    stream_i = nk.get_stream()

    # for testing: rank 0's sleep blocks the other ranks' allreduce
    if rank == 0:
        print('-x' * 40, rank)
        time.sleep(3)
        print('=x' * 40, rank)

    r = nk.group_start()
    print(rank, '>>> ncclGroupStart ', r)

    # NOTE: in PyTorch, Tensor.t() only swaps strides and does not rearrange the
    # underlying memory, so a fresh contiguous Tensor is created before its
    # pointer is handed to NCCL
    #t_arr_send = d_arr_send
    t_arr_send = torch.Tensor(d_arr_send.cpu()).cuda()

    #p_arr_send = d_arr_send.data_ptr()
    p_arr_send = t_arr_send.data_ptr()
    p_arr_recv = d_arr_recv.data_ptr()
    '''
    r = nk.all_reduce(p_arr_send, p_arr_recv,
                      sz,
                      pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                      comm_i, stream_i.handle)  # NOTE:
                      #comm_i, c_void_p(0))  # NOTE:
    print(rank, '>>> ncclAllReduce ', rank, r)
    '''

    r = nk.all_gather(p_arr_send, p_arr_recv, sz, pynccl.binding.ncclFloat,
                      comm_i, stream_i.handle)
    print(rank, '>>> ncclAllGather ', r)

    r = nk.group_end()
    print(rank, '>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    r_arr = d_arr_recv.cpu().numpy()
    time.sleep(rank)
    print(rank, 'r_arr', r_arr)
    #print(rank, w.cpu().numpy() == r_arr)
    print(rank, (w.cpu().numpy() - r_arr) < 1e-6)

    r = nk.comm_destroy(comm_i)
    print(rank, '>>> ncclCommDestroy ', r)
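The CPU round-trip above guarantees a densely laid out buffer before data_ptr() is handed to NCCL. An alternative sketch (an assumption about intent, not a change to the original) keeps the copy on the GPU by using Tensor.contiguous(), which returns the tensor itself when it is already contiguous and a contiguous copy otherwise:

import torch

def nccl_send_ptr(t):
    # ensure the memory behind data_ptr() is dense before NCCL reads it; the
    # (possibly new) tensor is returned as well so the buffer stays alive
    # while NCCL uses the pointer
    t = t.contiguous()
    return t, t.data_ptr()

# usage in place of the CPU round-trip above:
#   t_arr_send, p_arr_send = nccl_send_ptr(d_arr_send)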