def get_intranode_pair_comm(self, pair):
    '''Create a gpucomm between the two processes in ``pair``.

    ``pair`` is a size-two tuple of MPI ranks: the server first (it becomes
    local rank 0 of the new communicator) and a worker second (local rank 1).

    The process at ``pair[1]`` sends its freshly generated clique id to
    ``pair[0]`` over ``self.comm``; both sides then install that same id and
    build a ``GpuComm`` of size ``len(pair)``.

    Returns the ``pygpu.collectives.GpuComm`` shared by the pair.
    '''
    from pygpu import collectives

    clique_id = collectives.GpuCommCliqueId(context=self.ctx)
    comm = self.comm

    # Tag 220 only serves to match this send with its recv.
    if comm.rank == pair[0]:
        # Server side: adopt the id generated by the worker.
        id_text = comm.recv(source=pair[1], tag=220)
    else:
        # Worker side: our own id becomes the shared one.
        id_text = clique_id.comm_id.decode('utf-8')
        comm.send(id_text, dest=pair[0], tag=220)

    clique_id.comm_id = bytearray(id_text.encode('utf-8'))

    # NOTE(review): the send/recv role is chosen from ``self.comm.rank`` but
    # the local rank from ``self.rank``; presumably they are the same MPI
    # rank -- confirm against the enclosing class.
    local_rank = 0 if self.rank == pair[0] else 1

    # len(pair) is the number of processes in the clique (normally 2).
    return collectives.GpuComm(clique_id, len(pair), local_rank)
def __init__(self, n_gpu, rank, master_rank):
    """Join the GPU collective of ``n_gpu`` processes as member ``rank``.

    The process whose ``rank`` equals ``master_rank`` publishes its clique id
    under ``sync.dict["gpu_comm_id"]``; every process then meets at
    ``sync.barrier`` and the non-master processes adopt the published id
    before the communicator is built.

    Sets ``self.comm``, ``self.n_gpu``, ``self.avg_fac`` (``1 / n_gpu``) and
    ``self.master_rank``.
    """
    context = theano.gpuarray.get_context(None)
    clique = gpu_coll.GpuCommCliqueId(context)
    is_master = rank == master_rank

    # Publish before the barrier, read after it.
    if is_master:
        sync.dict["gpu_comm_id"] = clique.comm_id
    sync.barrier.wait()
    if not is_master:
        clique.comm_id = sync.dict["gpu_comm_id"]

    self.comm = gpu_coll.GpuComm(clique, n_gpu, rank)
    self.n_gpu = n_gpu
    self.avg_fac = 1.0 / n_gpu
    self.master_rank = master_rank
def init_comm(self, n_itr, log_interval_itrs):
    """Publish run settings, then create the GPU communicator.

    Writes this process's clique id, ``n_itr``, ``log_interval_itrs`` and the
    policy's current parameter values into ``self.par_objs.dict``, waits at
    ``self.par_objs.barrier``, builds a ``GpuComm`` of ``self.n_runners``
    members with this process as ``self.rank``, and passes it to
    ``self.algo.optimizer.init_comm``.
    """
    import theano.gpuarray
    from pygpu import collectives as gpu_coll

    context = theano.gpuarray.get_context(None)
    clique = gpu_coll.GpuCommCliqueId(context)

    shared = self.par_objs.dict
    shared["gpu_comm_id"] = clique.comm_id
    shared["n_itr"] = n_itr
    shared["log_interval_itrs"] = log_interval_itrs
    shared["initial_param_values"] = self.policy.get_param_values()

    # Everything above must be visible before the other side proceeds.
    self.par_objs.barrier.wait()

    communicator = gpu_coll.GpuComm(clique, self.n_runners, self.rank)
    self.algo.optimizer.init_comm(communicator, self.rank, self.n_runners)
def init_gpu(rank, n_gpu, sync, is_master=True):
    """
    Bind this process to device ``"cuda<rank>"`` and join the GPU collective.

    Per the original author: this happens after atexit.register(_close) in
    master and when g.forked=False, but before atexit.register(error_close)
    in workers, so should be careful.

    The three ``sync.init.barriers.gpu_inits`` barriers are always reached
    through ``finally`` clauses (or unconditionally), and
    ``sync.exct.workers_OK`` is cleared by any process whose setup raises a
    non-import error.

    Returns:
        the ``GpuComm`` on success; ``False`` if ``workers_OK`` was found
        cleared after a barrier; ``None`` in a worker that hit ImportError.

    Raises:
        ImportError: in the master only, when theano/pygpu cannot be imported.
        Exception: any other setup failure is re-raised after being flagged.

    TODO: probably can simplify or otherwise improve the error catching.
    """
    barriers = sync.init.barriers.gpu_inits
    ok_flag = sync.exct.workers_OK
    device = "cuda" + str(rank)

    # Stage 0: select the device and create a clique id.
    try:
        import theano.gpuarray
        theano.gpuarray.use(device)
        from pygpu import collectives as gpu_coll
        context = theano.gpuarray.get_context(None)
        clique = gpu_coll.GpuCommCliqueId(context)
    except ImportError as e:
        if is_master:
            raise e  # (only master raises ImportError, will join subprocesses)
        return None  # (workers exit quietly)
    except Exception as e:
        ok_flag.value = False  # (let others know it failed)
        raise e
    finally:
        barriers[0].wait()

    if not ok_flag.value:
        return False  # (someone else failed)

    # Stage 1: master publishes its clique id, workers adopt it.
    if is_master:
        sync.init.dict["comm_id"] = clique.comm_id
    barriers[1].wait()
    if not is_master:
        clique.comm_id = sync.init.dict["comm_id"]

    # Stage 2: build the communicator.
    try:
        gpu_comm = gpu_coll.GpuComm(clique, n_gpu, rank)
    except Exception as e:
        ok_flag.value = False
        raise e
    finally:
        barriers[2].wait()

    if ok_flag.value:
        return gpu_comm  # (success)
    return False  # (someone else failed)
def init_nccl_env(mpi_comm):
    """Build a GPU communicator spanning every rank of ``mpi_comm``.

    Rank 0's clique id is broadcast over ``mpi_comm`` and installed on every
    rank; the resulting ``GpuComm`` has one member per MPI rank, numbered by
    MPI rank. Prints a confirmation line, flushes stdout, and returns the
    communicator.
    """
    from pygpu import collectives as gpucoll
    from theano import gpuarray as theanoga

    context_name = None
    context = theanoga.get_context(context_name)
    clique = gpucoll.GpuCommCliqueId(context)

    my_rank = mpi_comm.Get_rank()
    world_size = mpi_comm.Get_size()

    # Only the root contributes a payload to the broadcast.
    if my_rank == 0:
        payload = clique.comm_id
    else:
        payload = None
    clique.comm_id = mpi_comm.bcast(payload, root=0)

    communicator = gpucoll.GpuComm(clique, world_size, my_rank)
    print('Init pygpu OK, rank %d' % my_rank)
    sys.stdout.flush()
    return communicator
def _register_to_platoon(self):
    """
    Ask the Controller for configuration information and create the NCCL
    communicator through which this worker takes part in its local node's
    world of workers.

    Theano must be imported for this to work: through Theano, this method
    obtains the single GPU context of this worker process, which is the
    context to be used for all computations done by the worker.

    .. note::
       This initialization method must have completed successfully before
       :meth:`all_reduce` is available and functional.

    :raises AttributeError: if pygpu (or theano) has not been imported.

    .. versionadded:: 0.6.0

    """
    if not pygpu:
        raise AttributeError("pygpu or theano is not imported")

    self.ctx_name = None
    self.gpuctx = theanoga.get_context(self.ctx_name)
    self.device = theanoconf.device
    self._local_id = gpucoll.GpuCommCliqueId(context=self.gpuctx)

    # Send our device and clique id; the controller answers with the local
    # and global layout this worker should use.
    request_info = {
        'device': self.device,
        'local_id': self._local_id.comm_id.decode('utf-8'),
    }
    reply = self.send_req("platoon-get_platoon_info", info=request_info)

    agreed_id = reply['local_id'].encode('utf-8')
    self._local_id.comm_id = bytearray(agreed_id)
    self._local_size = reply['local_size']
    self._local_rank = reply['local_rank']
    self._local_comm = gpucoll.GpuComm(self._local_id,
                                       self._local_size,
                                       self._local_rank)

    self._multinode = reply['multinode']
    self._global_size = reply['global_size']
    self._global_rank = reply['global_rank']
def get_intranode_comm_pair(self, pre_random_array):
    """Create a gpucomm shared with this worker's partner.

    Scans ``pre_random_array`` (a sequence of two-element rank pairs) for the
    first pair that contains ``self.interrank``. The position of that pair
    is used to derive a clique id both partners can compute independently:
    every occurrence of this process's pid inside the freshly generated id
    is overwritten with the pair index digit, repeated to the same length.
    (Presumably the generated id embeds the pid -- this depends on the
    pygpu/NCCL version in use.)

    Assumes both partners run on the same node.

    Returns:
        ``(gpucomm, pair)`` -- the communicator and the matched pair. The
        process listed first in the pair is local rank 0, the other rank 1.

    Raises:
        ValueError: if no pair contains ``self.interrank``, or if the pair's
            index is greater than 9.
    """
    import os

    # Find the pair first so that bad input fails before any GPU state is
    # allocated.
    pair = None
    pair_index = None
    for index, candidate in enumerate(pre_random_array):
        if (candidate[0] == self.interrank) or (candidate[1] == self.interrank):
            pair, pair_index = candidate, index
            break
    if pair is None:
        raise ValueError(
            'rank %s does not appear in any pair' % (self.interrank,))
    if pair_index > 9:
        # The pid is overwritten digit-for-digit, so the index has to be a
        # single digit to keep the id length unchanged.
        raise ValueError(
            'pair index %d exceeds 9; it must be a single digit' % pair_index)

    clique_id = collectives.GpuCommCliqueId(context=self.ctx)
    pid = str(os.getpid())
    replacement = ('%d' % pair_index) * len(pid)
    shared_id = clique_id.comm_id.decode('utf-8').replace(pid, replacement)
    clique_id.comm_id = bytearray(shared_id.encode('utf-8'))

    local_size = len(pair)
    local_rank = 0 if self.interrank == pair[0] else 1
    gpucomm = collectives.GpuComm(clique_id, local_size, local_rank)

    if self.test:
        print('on rank %d, pair %s generated' % (self.interrank, pair))

    return gpucomm, pair
def get_intranode_comm(self):
    '''Create ``self.gpucomm``, a gpucomm between all synchronous workers
    on this host.

    Rank 0 of ``self.comm`` broadcasts its clique id and every rank installs
    it. Each rank then publishes a ``"<hostname>,<rank>"`` tag; the tags
    that belong to this host determine the clique size, and this rank's
    position among them is its local rank.
    '''
    from pygpu import collectives
    import os

    clique_id = collectives.GpuCommCliqueId(context=self.ctx)
    comm = self.comm

    own_id = clique_id.comm_id.decode('utf-8')
    shared_id = comm.bcast(own_id if comm.rank == 0 else None, root=0)
    clique_id.comm_id = bytearray(shared_id.encode('utf-8'))
    # NOTE(review): the same id goes to every rank in ``comm`` while the
    # clique size below counts only same-host ranks -- confirm this is
    # intended when running on more than one node.

    # 1. gather every rank's "hostname,rank" tag
    hostname = os.uname()[1]
    my_tag = "%s,%d" % (hostname, self.rank)
    all_tags = comm.allgather(my_tag)

    # 2. keep the tags of this host. The trailing comma is part of the
    #    prefix so that e.g. "node1" does not also match "node10,3".
    local_prefix = hostname + ","
    local_tags = [tag for tag in all_tags if tag.startswith(local_prefix)]

    # 3. the number of local tags is the clique size; 4. our own tag's
    #    position is our local rank (our tag is always among them).
    local_size = len(local_tags)
    local_rank = local_tags.index(my_tag)

    self.gpucomm = collectives.GpuComm(clique_id, local_size, local_rank)
def get_intranode_comm(rank, size, ctx):
    """Create a gpucomm of ``size`` workers with this process as ``rank``.

    No id is exchanged between processes: each one overwrites every
    occurrence of its own pid in the generated clique id with zeros of the
    same length, so that the processes end up with a common "0......" id.
    Assumes all workers run within a single node (at most 8 per node on
    copper, per the original author).
    """
    import os
    from pygpu import collectives

    clique = collectives.GpuCommCliqueId(context=ctx)

    pid_text = str(os.getpid())
    zeros = '0' * len(pid_text)
    universal_id = clique.comm_id.decode('utf-8').replace(pid_text, zeros)
    clique.comm_id = bytearray(universal_id.encode('utf-8'))

    return collectives.GpuComm(clique, size, rank)
def __init__(self, rank, world_size, port_pub_sub, port_push_pull, job_id):
    """Set up a worker: agree on a clique id, join the GPU collective and
    open the sockets.

    The clique id is agreed on through the file ``comm_id.pkl`` in the
    current directory: the first worker to take the lock finds no file and
    writes its own id; later workers load that id and adopt it.

    ``GpuComm`` construction blocks until all ``world_size`` workers have
    called it.
    """
    self.rank = rank
    self.world_size = world_size
    self.port_pub_sub = port_pub_sub
    self.port_push_pull = port_push_pull
    self.job_id = job_id

    self._lock = posix_ipc.Semaphore("{}_lock".format(self.job_id))

    self.gpu_ctx = gpuarray.get_context(None)
    self.local_id = collectives.GpuCommCliqueId(context=self.gpu_ctx)

    # NOTE(review): the file name is fixed and nothing here removes it, so
    # a file left over from an earlier run would presumably be picked up as
    # the shared id -- confirm cleanup happens elsewhere.
    comm_id_file = 'comm_id.pkl'

    self.lock()
    try:
        if not os.path.isfile(comm_id_file):
            comm_id = self.local_id.comm_id
            utils.dump_pkl(comm_id, comm_id_file)
        else:
            comm_id = utils.load_pkl(comm_id_file)
            self.local_id.comm_id = comm_id
    finally:
        # Always release, otherwise a failed read/write here would leave
        # every other worker waiting on the lock.
        self.unlock()

    # Single-argument form parses on Python 2 and 3; the text matches the
    # former ``print 'local_id ', value`` statement.
    print('local_id  %s' % (self.local_id.comm_id,))

    # the following call is blocked till all workers finish calling it
    self.local_comm = collectives.GpuComm(self.local_id, self.world_size,
                                          self.rank)
    self.init_socket()
    print('finish init worker with rank %d' % rank)