def test_2_002(argv):
    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    nuid = pynccl.binding.ncclUniqueId()
    print(nuid)
    print(type(nuid.internal))

    # Pre-fill the id by hand: socket.AF_INET = 2 ==> 0x0002 ==> chr(0) + chr(2),
    # followed by an 'nccl-<pid>-<counter>' tag.  TODO: global counter instead of 0.
    intnl_buf = chr(0) + chr(2) + 'nccl-%d-%d' % (os.getpid(), 0)
    intnl_buf += chr(0) * (pynccl.binding.NCCL_UNIQUE_ID_BYTES - len(intnl_buf))
    nuid.internal = intnl_buf
    print(nuid.internal)
    print(len(nuid.internal))

    # Let libnccl overwrite the hand-made id with a real one.
    r = api.ncclGetUniqueId(byref(nuid))
    print('>>> ncclGetUniqueId ', r)

    comm = c_void_p(0)
    nRanks = int(argv[1])    # e.g. 2
    myRank = int(argv[2])    # e.g. 0
    # NOTE: the ncclUniqueId struct is passed by value, not via byref(nuid)
    r = api.ncclCommInitRank(byref(comm), nRanks, nuid, myRank)
    print('>>> ncclCommInitRank ', r)

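# A minimal driver sketch for test_2_002 (not part of the original tests): the
# helper name run_test_2_002 and the use of sys.argv are illustrative
# assumptions.  Each rank process would be launched separately, e.g.
# `python this_test.py 2 0` and `python this_test.py 2 1`.  Note that, as
# written, every rank calls ncclGetUniqueId itself; for the ranks to join one
# communicator, the id produced by rank 0 has to be shared with the others,
# as the queue-based workers further down do.
def run_test_2_002():
    import sys
    test_2_002(sys.argv)
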
def test_2_001():
    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    # ctypes array types: 4 communicator handles and 4 device ids
    comms_a = c_void_p * 4
    int_a = c_int * 4

    comms = comms_a(0, 0, 0, 0)
    comms_p = cast(comms, POINTER(c_void_p))
    devs = int_a(0, 1, 2, 3)
    devs_p = cast(devs, POINTER(c_int))    # NOTE: cast(), not byref(devs)

    r = api.ncclCommInitAll(comms_p, 4, devs_p)
    print(r)

    for i in range(4):
        print(comms[i])
        r = api.ncclCommDestroy(comms[i])
        print(r)

def test_2_003():
    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    # -------------------------------------
    procs = []

    # -------------------------------------
    comms_a = c_void_p * 4
    int_a = c_int * 4

    comms = comms_a(0, 0, 0, 0)
    comms_p = cast(comms, POINTER(c_void_p))
    devs = int_a(0, 1, 2, 3)
    devs_p = cast(devs, POINTER(c_int))

    r = api.ncclCommInitAll(comms_p, 4, devs_p)
    print('>>> ncclCommInitAll ', r)

    # -------------------------------------
    for i in range(4):
        worker = mp.Process(
            target=gpu_worker_proc,
            args=(api, i, i, comms[i]))
        worker.daemon = True
        worker.start()
        procs.append(worker)

    for worker in procs:
        worker.join()

    # -------------------------------------
    for i in range(4):
        r = api.ncclCommDestroy(comms[i])
        print('>>> ncclCommDestroy ', r)

def gpu_worker_proc_41_2(api, kn, rank, gpu_i, q):
    # NOTE: select the device before doing anything else
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()

    # build three communicators over shrinking process groups
    pg0 = list(range(kn))
    comm_0 = cre_nccl_comm_fn(nk, q, rank, pg0)

    pg1 = list(range(kn))[1:]
    comm_1 = cre_nccl_comm_fn(nk, q, rank, pg1)

    pg2 = list(range(kn))[2:]
    comm_2 = cre_nccl_comm_fn(nk, q, rank, pg2)

    time.sleep(2)

    nccl_fn_on_comm(nk, comm_0, q, rank, pg0, gpu_i)
    time.sleep(1)
    nccl_fn_on_comm(nk, comm_1, q, rank, pg1, gpu_i)
    time.sleep(1)
    nccl_fn_on_comm(nk, comm_2, q, rank, pg2, gpu_i)
    time.sleep(1)

    r = nk.comm_destroy(comm_0)
    print(rank, '>>> ncclCommDestroy ', r)
    r = nk.comm_destroy(comm_1)
    print(rank, '>>> ncclCommDestroy ', r)
    r = nk.comm_destroy(comm_2)
    print(rank, '>>> ncclCommDestroy ', r)

def test_2_004():
    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    # -------------------------------------
    procs = []
    q = mp.Queue()    # mp.queues.Queue() needs an explicit context; use mp.Queue()

    # -------------------------------------
    for i in range(4):
        worker = mp.Process(
            target=gpu_worker_proc_4,
            args=(api, i, i, q))
        worker.daemon = True
        worker.start()
        procs.append(worker)

    for worker in procs:
        worker.join()

def gpu_worker_proc_5_2(api, kn, rank, gpu_i, q):
    # MXNet variant: rank 0 creates w, every rank all-gathers its column slice.
    # NOTE: select the device before doing anything else
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()

    if rank == 0:
        nuid = nk.get_unique_id()
        w = mxutils.mx.nd.array(np.random.random((5, kn * 10)), dtype=np.float32)    # w
        print('w', w)
        for j in range(kn - 1):
            q.put((nuid, w))
    else:
        nuid, w = q.get()

    # -------------------------------------
    arr_send = w[:, rank * 10:(rank + 1) * 10]                         # this rank's slice
    arr_recv = mxutils.mx.nd.zeros((kn * 10, 5), dtype=np.float32)     # recv.T
    print(arr_send[1][1])

    sz = arr_send.size

    d_arr_send = arr_send.as_in_context(mxutils.context.gpu(gpu_i))
    d_arr_recv = arr_recv.as_in_context(mxutils.context.gpu(gpu_i))

    comm_i = nk.get_comm()
    nRanks = int(kn)
    myRank = int(rank)
    r = nk.comm_init_rank(byref(comm_i), nRanks, nuid, myRank)
    print('>>> ncclCommInitRank ', r)

    stream_i = nk.get_stream()

    # for test: rank 0's sleep blocks the other ranks' collective
    if rank == 0:
        print('-x' * 40, rank)
        time.sleep(10)
        print('=x' * 40, rank)

    r = nk.group_start()
    print('>>> ncclGroupStart ', r)

    t_arr_send = d_arr_send.T
    p_arr_send = t_arr_send.get_data_p()
    p_arr_recv = d_arr_recv.get_data_p()

    r = nk.all_gather(p_arr_send, p_arr_recv, sz,
                      pynccl.binding.ncclFloat,
                      comm_i, stream_i.handle)
    print('>>> ncclAllGather ', r)

    r = nk.group_end()
    print('>>> ncclGroupEnd ', r)

    stream_i.synchronize()
    mxutils.mx.ndarray.ndarray.waitall()

    r_arr = d_arr_recv.asnumpy()
    # the gathered result is r_arr.T
    print(rank, w.asnumpy() == r_arr.T)

    r = nk.comm_destroy(comm_i)
    print('>>> ncclCommDestroy ', r)

def gpu_worker_proc_5(api, kn, rank, gpu_i, q):
    # NOTE: select the device before doing anything else
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    if rank == 0:
        nuid = pynccl.binding.ncclUniqueId()
        # pre-fill the id by hand (AF_INET = 2 ==> chr(0) + chr(2)), then let
        # libnccl overwrite it with a real one
        intnl_buf = chr(0) + chr(2) + 'nccl-%d-%d' % (os.getpid(), 0)
        intnl_buf += chr(0) * (pynccl.binding.NCCL_UNIQUE_ID_BYTES - len(intnl_buf))
        nuid.internal = intnl_buf
        r = api.ncclGetUniqueId(byref(nuid))
        print('>>> ncclGetUniqueId ', r)
        for j in range(kn - 1):
            q.put(nuid)
    else:
        nuid = q.get()

    # -------------------------------------
    arr_send = mxutils.mx.nd.zeros((1000, 10000), dtype=np.float32)
    arr_recv = arr_send.zeros_like()
    arr_send[1][1] = random.random()
    print(arr_send[1][1])

    sz = arr_send.size

    d_arr_send = arr_send.as_in_context(mxutils.context.gpu(gpu_i))
    d_arr_recv = arr_recv.as_in_context(mxutils.context.gpu(gpu_i))

    comm_i = c_void_p(0)
    nRanks = int(kn)
    myRank = int(rank)
    r = api.ncclCommInitRank(byref(comm_i), nRanks, nuid, myRank)
    print('>>> ncclCommInitRank ', r)

    stream_i = cuda.stream()

    r = api.ncclGroupStart()
    print('>>> ncclGroupStart ', r)

    # C usage for reference:
    # ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size,
    #               ncclFloat, ncclSum, comms[i], s[i])
    p_arr_send = d_arr_send.get_data_p()
    p_arr_recv = d_arr_recv.get_data_p()
    r = api.ncclAllReduce(p_arr_send, p_arr_recv, sz,
                          pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                          comm_i, stream_i.handle)
    print('>>> ncclAllReduce ', r)

    r = api.ncclGroupEnd()
    print('>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    r_arr = d_arr_recv.asnumpy()
    print(r_arr[1][1])

    r = api.ncclCommDestroy(comm_i)
    print('>>> ncclCommDestroy ', r)

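# A driver sketch for gpu_worker_proc_5 (not in the original file), mirroring
# the spawn/join pattern of test_2_004 above; the name test_2_005 and the
# fixed count of 4 ranks/GPUs are illustrative assumptions.
def test_2_005():
    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    kn = 4    # assumed: one rank per GPU on a 4-GPU machine
    procs = []
    q = mp.Queue()

    for i in range(kn):
        worker = mp.Process(
            target=gpu_worker_proc_5,
            args=(api, kn, i, i, q))
        worker.daemon = True
        worker.start()
        procs.append(worker)

    for worker in procs:
        worker.join()
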
def gpu_worker_proc_5_3(api, kn, rank, gpu_i, q):
    # NOTE: select the device before doing anything else
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()

    # unique id for the communicator spanning all ranks
    if rank == 0:
        nuid0 = nk.get_unique_id()
        for j in range(kn - 1):
            q.put(nuid0)
    else:
        nuid0 = q.get()

    # -------------------------------------
    time.sleep(10)

    # -------------------------------------
    # unique id for the communicator of the odd ranks only
    if rank == 1:
        nuid1 = nk.get_unique_id()
        for j in range(kn - 1):
            q.put(nuid1)
    else:
        if rank % 2 == 1:
            nuid1 = q.get()

    # -------------------------------------
    arr_send = mxutils.mx.nd.zeros((1000, 10000), dtype=np.float32)
    arr_recv = arr_send.zeros_like()
    arr_send[1][1] = random.random()
    print(arr_send[1][1])

    sz = arr_send.size

    d_arr_send = arr_send.as_in_context(mxutils.context.gpu(gpu_i))
    d_arr_recv = arr_recv.as_in_context(mxutils.context.gpu(gpu_i))

    # ------------------------------------- communicator 0: all ranks
    comm_i0 = nk.get_comm()
    nRanks = int(kn)
    myRank = int(rank)
    r = nk.comm_init_rank(byref(comm_i0), nRanks, nuid0, myRank)
    print('>>> ncclCommInitRank ', r)

    stream_i0 = nk.get_stream()

    # for test: rank 0's sleep blocks the other ranks' allreduce
    if rank == 0:
        print('-x' * 40, rank)
        time.sleep(10)
        print('=x' * 40, rank)

    r = nk.group_start()
    print('>>> ncclGroupStart ', r)

    p_arr_send = d_arr_send.get_data_p()
    p_arr_recv = d_arr_recv.get_data_p()
    r = nk.all_reduce(p_arr_send, p_arr_recv, sz,
                      pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                      comm_i0, stream_i0.handle)
    print('>>> ncclAllReduce ', rank, r)

    r = nk.group_end()
    print('>>> ncclGroupEnd ', r)

    stream_i0.synchronize()

    r_arr = d_arr_recv.asnumpy()
    print(r_arr[1][1])

    # ------------------------------------- communicator 1: odd ranks only
    if rank % 2 == 1:
        comm_i1 = nk.get_comm()
        nRanks = int(kn / 2)
        myRank = int(rank / 2)
        r = nk.comm_init_rank(byref(comm_i1), nRanks, nuid1, myRank)
        print('>>> 1 ncclCommInitRank ', r)

        stream_i1 = nk.get_stream()

        r = nk.group_start()
        print('>>> 1 ncclGroupStart ', r)

        p_arr_send = d_arr_send.get_data_p()
        p_arr_recv = d_arr_recv.get_data_p()
        r = nk.all_reduce(p_arr_send, p_arr_recv, sz,
                          pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                          comm_i1, stream_i1.handle)
        print('>>> 1 ncclAllReduce ', rank, r)

        r = nk.group_end()
        print('>>> 1 ncclGroupEnd ', r)

        stream_i1.synchronize()

        r_arr = d_arr_recv.asnumpy()
        print(r_arr[1][1])
    else:
        time.sleep(15)

    # -------------------------------------
    r = nk.comm_destroy(comm_i0)
    print('>>> ncclCommDestroy ', r)
    if rank % 2 == 1:
        r = nk.comm_destroy(comm_i1)
        print('>>> ncclCommDestroy ', r)

def gpu_worker_proc_4(api, rank, gpu_i, q):
    nk = pynccl.Nccl()
    nc = nk._nccl    # cuNccl
    api = nc._api    # libnccl

    if rank == 0:
        comms_a = c_void_p * 4
        int_a = c_int * 4

        comms = comms_a(0, 0, 0, 0)
        comms_p = cast(comms, POINTER(c_void_p))
        devs = int_a(0, 1, 2, 3)
        devs_p = cast(devs, POINTER(c_int))

        r = api.ncclCommInitAll(comms_p, 4, devs_p)
        print('>>> ncclCommInitAll ', r)

        for j in range(3):    # one copy per peer rank
            q.put(comms)
    else:
        comms = q.get()

    # -------------------------------------
    arr_send = np.array(np.random.random((1000, 10000)), dtype=np.float32)
    arr_recv = np.empty_like(arr_send)
    print(arr_send[1][1])

    sz = 32 * 1000 * 10000

    cuda.select_device(gpu_i)
    d_arr_send = cuda.to_device(arr_send)
    d_arr_recv = cuda.to_device(arr_recv)

    comm_i = comms[rank]    # this rank's communicator
    stream_i = cuda.stream()

    r = api.ncclGroupStart()
    print('>>> ncclGroupStart ', r)

    # C usage for reference:
    # ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size,
    #               ncclFloat, ncclSum, comms[i], s[i])
    r = api.ncclAllReduce(d_arr_send.device_ctypes_pointer, d_arr_recv.device_ctypes_pointer,
                          sz, pynccl.binding.ncclFloat, pynccl.binding.ncclSum,
                          comm_i, stream_i)
    print('>>> ncclAllReduce ', r)

    r = api.ncclGroupEnd()
    print('>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    r_arr = d_arr_recv.copy_to_host()
    print(r_arr[1][1])

    # -------------------------------------
    if rank == 0:
        for i in range(4):
            r = api.ncclCommDestroy(comms[i])
            print('>>> ncclCommDestroy ', r)

def test_1_001():
    nk = pynccl.Nccl()
    print(dir(nk))

def gpu_worker_proc_5_2(api, kn, rank, gpu_i, q):
    # PyTorch variant (shares its name with the MXNet worker above).
    # NOTE: select the device before doing anything else
    cuda.select_device(gpu_i)

    nk = pynccl.Nccl()

    if rank == 0:
        nuid = nk.get_unique_id()
        w = torch.Tensor(np.random.random((kn * 10, 5)))    # w
        print('w', w)
        for j in range(kn - 1):
            q.put((nuid, w))
    else:
        nuid, w = q.get()

    # -------------------------------------
    arr_send = w[rank * 10:(rank + 1) * 10, :]    # this rank's slice of w
    arr_recv = torch.zeros((kn * 10, 5))          # recv
    print(arr_send[1][1])

    sz = np.prod(arr_send.size())    # element count, not bytes

    d_arr_send = arr_send.cuda(gpu_i)
    d_arr_recv = arr_recv.cuda(gpu_i)

    comm_i = nk.get_comm()
    nRanks = int(kn)
    myRank = int(rank)
    r = nk.comm_init_rank(byref(comm_i), nRanks, nuid, myRank)
    print(rank, '>>> ncclCommInitRank ', r)

    stream_i = nk.get_stream()

    # for test: rank 0's sleep blocks the other ranks' collective
    if rank == 0:
        print('-x' * 40, rank)
        time.sleep(3)
        print('=x' * 40, rank)

    r = nk.group_start()
    print(rank, '>>> ncclGroupStart ', r)

    # NOTE: in pytorch, Tensor.t() does not change the underlying memory
    # layout, so hand nccl a freshly-created (contiguous) tensor instead
    t_arr_send = torch.Tensor(d_arr_send.cpu()).cuda()
    p_arr_send = t_arr_send.data_ptr()
    p_arr_recv = d_arr_recv.data_ptr()

    r = nk.all_gather(p_arr_send, p_arr_recv, sz,
                      pynccl.binding.ncclFloat,
                      comm_i, stream_i.handle)
    print(rank, '>>> ncclAllGather ', r)

    r = nk.group_end()
    print(rank, '>>> ncclGroupEnd ', r)

    stream_i.synchronize()

    r_arr = d_arr_recv.cpu().numpy()
    time.sleep(rank)
    print(rank, 'r_arr', r_arr)
    print(rank, (w.cpu().numpy() - r_arr) < 1e-6)

    r = nk.comm_destroy(comm_i)
    print(rank, '>>> ncclCommDestroy ', r)

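# A driver sketch for the PyTorch gpu_worker_proc_5_2 variant directly above
# (not in the original file); the name test_2_006 and the 4-GPU assumption are
# illustrative.  It reuses the spawn/join pattern of test_2_004.
def test_2_006():
    nk = pynccl.Nccl()
    api = nk._nccl._api    # libnccl

    kn = 4    # assumed GPU/rank count
    procs = []
    q = mp.Queue()

    for i in range(kn):
        worker = mp.Process(
            target=gpu_worker_proc_5_2,
            args=(api, kn, i, i, q))
        worker.daemon = True
        worker.start()
        procs.append(worker)

    for worker in procs:
        worker.join()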