def copy_to_local(self):

    assert self.comm is not None

    if self.etype == 'server':
        # copy weights from g_param to g_param_ga
        for g_param, g_param_ga in \
                zip(self.g_param_list, self.g_param_ga_list):
            param_ga = \
                theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)
            self.drv.memcpy_dtod(g_param_ga.ptr,
                                 param_ga.ptr,
                                 param_ga.dtype.itemsize *
                                 param_ga.size)

        # Send(self, buf, int dest=0, int tag=0)
        mpitp = dtype_to_mpi(self.g_param_ga_list[0].dtype)
        for g_param_ga in self.g_param_ga_list:
            self.comm.Send(buf=[bufint(g_param_ga), mpitp], dest=self.dest)

    elif self.etype == 'worker':
        mpitp = dtype_to_mpi(self.w_param_ga_list[0].dtype)
        for w_param_ga in self.w_param_ga_list:
            self.comm.Recv(buf=[bufint(w_param_ga), mpitp], source=self.dest)

        # copy weights from w_param_ga to w_param
        for w_param_ga, w_param in \
                zip(self.w_param_ga_list, self.w_param_list):
            param_ga = \
                theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)
            self.drv.memcpy_dtod(param_ga.ptr,
                                 w_param_ga.ptr,
                                 param_ga.dtype.itemsize *
                                 param_ga.size)

    self.comm = None

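# For reference, a minimal host-side sketch of the same Send/Recv hand-off,
# assuming an mpi4py world where rank `dest` is the peer and the parameters
# are float32 NumPy arrays of matching shapes on both sides. This simplifies
# away the GPUArray/bufint staging; the function and argument names are
# illustrative only, not part of the exchanger API.
from mpi4py import MPI


def copy_to_local_sketch(param_list, etype, dest, comm=MPI.COMM_WORLD):
    # The server pushes every parameter buffer; the worker overwrites its
    # local copies in place with what it receives.
    for p in param_list:
        if etype == 'server':
            comm.Send([p, MPI.FLOAT], dest=dest)
        elif etype == 'worker':
            comm.Recv([p, MPI.FLOAT], source=dest)
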
def exchange(self):
    # Server and worker send their params to each other.
    # This function requires the worker to first send an 'exchange' message
    # to the server through the REQ-REP socket.

    assert self.comm is not None

    if self.etype == 'server':
        # copy weights from g_param to g_param_ga
        for g_param, g_param_ga in \
                zip(self.g_param_list, self.g_param_ga_list):
            param_ga = \
                theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)
            self.drv.memcpy_dtod(g_param_ga.ptr,
                                 param_ga.ptr,
                                 param_ga.dtype.itemsize *
                                 param_ga.size)

        # Sendrecv(self, sendbuf, int dest=0, int sendtag=0, recvbuf=None,
        #          int source=0, int recvtag=0, Status status=None)
        for g_param_ga, w_param_ga in zip(self.g_param_ga_list,
                                          self.w_param_ga_list):
            self.comm.Sendrecv(sendbuf=[bufint(g_param_ga), MPI.FLOAT],
                               dest=self.dest,
                               recvbuf=[bufint(w_param_ga), MPI.FLOAT],
                               source=self.dest)

        # copy weights from w_param_ga to w_param
        for w_param, w_param_ga in \
                zip(self.w_param_list, self.w_param_ga_list):
            param_ga = \
                theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)
            self.drv.memcpy_dtod(param_ga.ptr,
                                 w_param_ga.ptr,
                                 w_param_ga.dtype.itemsize *
                                 w_param_ga.size)

    elif self.etype == 'worker':
        # copy weights from w_param to w_param_ga
        for w_param, w_param_ga in \
                zip(self.w_param_list, self.w_param_ga_list):
            param_ga = \
                theano.misc.pycuda_utils.to_gpuarray(w_param.container.value)
            self.drv.memcpy_dtod(w_param_ga.ptr,
                                 param_ga.ptr,
                                 param_ga.dtype.itemsize *
                                 param_ga.size)

        # Sendrecv(self, sendbuf, int dest=0, int sendtag=0, recvbuf=None,
        #          int source=0, int recvtag=0, Status status=None)
        for w_param_ga, g_param_ga in zip(self.w_param_ga_list,
                                          self.g_param_ga_list):
            self.comm.Sendrecv(sendbuf=[bufint(w_param_ga), MPI.FLOAT],
                               dest=self.dest,
                               recvbuf=[bufint(g_param_ga), MPI.FLOAT],
                               source=self.dest)

        # copy weights from g_param_ga to g_param
        for g_param, g_param_ga in \
                zip(self.g_param_list, self.g_param_ga_list):
            param_ga = \
                theano.misc.pycuda_utils.to_gpuarray(g_param.container.value)
            self.drv.memcpy_dtod(param_ga.ptr,
                                 g_param_ga.ptr,
                                 g_param_ga.dtype.itemsize *
                                 g_param_ga.size)

    self.update_func()

    self.comm = None

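# A host-side sketch of the simultaneous parameter swap above, assuming two
# mpi4py ranks (one acting as 'server', one as 'worker') and float32 NumPy
# buffers: `local_params` holds this side's copies to send, `remote_params`
# receives the peer's copies. The PyCUDA staging, update_func and REQ-REP
# handshake are omitted; all names here are illustrative.
from mpi4py import MPI


def exchange_sketch(local_params, remote_params, peer, comm=MPI.COMM_WORLD):
    # Sendrecv avoids the deadlock that two blocking Sends could cause when
    # both sides try to exchange at the same time.
    for send_buf, recv_buf in zip(local_params, remote_params):
        comm.Sendrecv(sendbuf=[send_buf, MPI.FLOAT], dest=peer,
                      recvbuf=[recv_buf, MPI.FLOAT], source=peer)
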
def exchange(self):

    mpidtype = self.mpidtype

    if self.avg:
        self.avg_func()

    # copy weights from param_ga to param_update_ga
    for param, param_update_ga in \
            zip(self.source_param_list, self.param_update_ga_list):
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        self.drv.memcpy_dtod(param_update_ga.ptr,
                             param_ga.ptr,
                             param_ga.dtype.itemsize *
                             param_ga.size)
    self.ctx.synchronize()

    if self.size == 2:

        for param_update_ga, d_param_tmp, numElements, grid_size in \
                zip(self.param_update_ga_list,
                    self.d_param_32_tmp_list,
                    self.numElements_list,
                    self.grid_size_list):
            '''
            Summing and Sharing GPU Data
            Sendrecv Pairing: 0 and 1
            '''
            if self.rank == 1:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=0,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=0)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                # synchronize the context after the kernel call to make
                # sure the kernel has finished
                self.ctx.synchronize()
            elif self.rank == 0:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=1,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=1)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                # synchronize the context after the kernel call to make
                # sure the kernel has finished
                self.ctx.synchronize()

        self.comm.Barrier()

    elif self.size == 4:

        for param_update_ga, d_param_tmp, numElements, grid_size in \
                zip(self.param_update_ga_list,
                    self.d_param_32_tmp_list,
                    self.numElements_list,
                    self.grid_size_list):
            '''
            Summing GPU Data

            Step 1
            Source GPU -> Destination GPU
            1 -> 0, 3 -> 2
            '''
            if self.rank % 2 == 1:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank - 1)
            elif self.rank % 2 == 0:
                self.comm.Recv([bufint(d_param_tmp), mpidtype],
                               source=self.rank + 1)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Step 2
            Sendrecv Pairing: 0 and 2
            '''
            if self.rank == 2:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=0,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=0)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()
            elif self.rank == 0:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=2,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=2)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Broadcasting Result
            Source GPU -> Destination GPU
            0 -> 1, 2 -> 3
            '''
            if self.rank % 2 == 0:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank + 1)
            elif self.rank % 2 == 1:
                self.comm.Recv([bufint(param_update_ga), mpidtype],
                               source=self.rank - 1)

        self.comm.Barrier()

    elif self.size == 8:
        # Use this for parameter sizes < 16MB.
        # Use Fei's implementation for parameter sizes > 16MB.

        for param_update_ga, d_param_tmp, numElements, grid_size in \
                zip(self.param_update_ga_list,
                    self.d_param_32_tmp_list,
                    self.numElements_list,
                    self.grid_size_list):
            '''
            Summing GPU Data

            Step 1
            Source GPU -> Destination GPU
            1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6
            '''
            if self.rank % 2 == 1:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank - 1)
            elif self.rank % 2 == 0:
                self.comm.Recv([bufint(d_param_tmp), mpidtype],
                               source=self.rank + 1)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Step 2
            Source GPU -> Destination GPU
            0 -> 2, 4 -> 6
            '''
            if self.rank % 4 == 0:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank + 2)
            elif (self.rank == 2) or (self.rank == 6):
                self.comm.Recv([bufint(d_param_tmp), mpidtype],
                               source=self.rank - 2)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Step 3
            Sendrecv Pairing: 2 and 6
            '''
            if self.rank == 2:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=6,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=6)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()
            elif self.rank == 6:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=2,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=2)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Broadcasting Results

            Step 1
            Source GPU -> Destination GPU
            2 -> 0, 6 -> 4
            '''
            if (self.rank == 2) or (self.rank == 6):
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank - 2)
            elif self.rank % 4 == 0:
                self.comm.Recv([bufint(param_update_ga), mpidtype],
                               source=self.rank + 2)

            '''
            Step 2
            Source GPU -> Destination GPU
            0 -> 1, 2 -> 3, 4 -> 5, 6 -> 7
            '''
            if self.rank % 2 == 0:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank + 1)
            elif self.rank % 2 == 1:
                self.comm.Recv([bufint(param_update_ga), mpidtype],
                               source=self.rank - 1)

        self.comm.Barrier()

    elif self.size == 16:

        for param_update_ga, d_param_tmp, numElements, grid_size in \
                zip(self.param_update_ga_list,
                    self.d_param_32_tmp_list,
                    self.numElements_list,
                    self.grid_size_list):
            '''
            Summing GPU Data

            Step 1
            Source GPU -> Destination GPU
            1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6,
            9 -> 8, 11 -> 10, 13 -> 12, 15 -> 14
            '''
            if self.rank % 2 == 1:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank - 1)
            elif self.rank % 2 == 0:
                self.comm.Recv([bufint(d_param_tmp), mpidtype],
                               source=self.rank + 1)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Step 2
            Source GPU -> Destination GPU
            0 -> 2, 4 -> 6, 8 -> 10, 12 -> 14
            '''
            if self.rank % 4 == 0:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank + 2)
            elif (self.rank == 2) or (self.rank == 6) or \
                    (self.rank == 10) or (self.rank == 14):
                self.comm.Recv([bufint(d_param_tmp), mpidtype],
                               source=self.rank - 2)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Step 3
            Source GPU -> Destination GPU
            2 -> 6, 10 -> 14
            '''
            if (self.rank == 2) or (self.rank == 10):
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank + 4)
            elif (self.rank == 6) or (self.rank == 14):
                self.comm.Recv([bufint(d_param_tmp), mpidtype],
                               source=self.rank - 4)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Step 4
            Sendrecv Pairing: 6 and 14
            '''
            if self.rank == 6:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=14,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=14)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()
            elif self.rank == 14:
                self.comm.Sendrecv([bufint(param_update_ga), mpidtype], dest=6,
                                   recvbuf=[bufint(d_param_tmp), mpidtype], source=6)
                self.vecadd(param_update_ga, d_param_tmp, numElements,
                            block=(256, 1, 1), grid=grid_size)
                self.ctx.synchronize()

            '''
            Broadcasting Result

            Step 1
            Source GPU -> Destination GPU
            6 -> 2, 14 -> 10
            '''
            if (self.rank == 6) or (self.rank == 14):
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank - 4)
            elif (self.rank == 2) or (self.rank == 10):
                self.comm.Recv([bufint(param_update_ga), mpidtype],
                               source=self.rank + 4)

            '''
            Step 2
            Source GPU -> Destination GPU
            2 -> 0, 6 -> 4, 10 -> 8, 14 -> 12
            '''
            if (self.rank == 2) or (self.rank == 6) or \
                    (self.rank == 10) or (self.rank == 14):
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank - 2)
            elif self.rank % 4 == 0:
                self.comm.Recv([bufint(param_update_ga), mpidtype],
                               source=self.rank + 2)

            '''
            Step 3
            Source GPU -> Destination GPU
            0 -> 1, 2 -> 3, 4 -> 5, 6 -> 7,
            8 -> 9, 10 -> 11, 12 -> 13, 14 -> 15
            '''
            if self.rank % 2 == 0:
                self.comm.Send([bufint(param_update_ga), mpidtype],
                               dest=self.rank + 1)
            elif self.rank % 2 == 1:
                self.comm.Recv([bufint(param_update_ga), mpidtype],
                               source=self.rank - 1)

        self.comm.Barrier()

    # copy weights from param_update_ga back to param_ga
    for param, param_update_ga in \
            zip(self.dest_param_list, self.param_update_ga_list):
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        self.drv.memcpy_dtod(param_ga.ptr,
                             param_update_ga.ptr,
                             param_update_ga.dtype.itemsize *
                             param_ga.size)
    self.ctx.synchronize()

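# The branches above implement a binary-tree reduce followed by the mirror
# broadcast for 2, 4, 8 or 16 GPUs. A host-side sketch of the 4-rank pattern
# (1->0 and 3->2, a Sendrecv between 0 and 2, then 0->1 and 2->3), using a
# float32 NumPy buffer and '+=' in place of the vecadd kernel; the function
# name and arguments are illustrative, not part of the exchanger API.
import numpy as np
from mpi4py import MPI


def tree_allreduce4_sketch(buf, comm=MPI.COMM_WORLD):
    rank = comm.Get_rank()
    tmp = np.empty_like(buf)
    # Summing step 1: odd ranks push into their even neighbour.
    if rank % 2 == 1:
        comm.Send([buf, MPI.FLOAT], dest=rank - 1)
    else:
        comm.Recv([tmp, MPI.FLOAT], source=rank + 1)
        buf += tmp
    # Summing step 2: ranks 0 and 2 swap partial sums and both add them.
    if rank in (0, 2):
        comm.Sendrecv([buf, MPI.FLOAT], dest=2 - rank,
                      recvbuf=[tmp, MPI.FLOAT], source=2 - rank)
        buf += tmp
    # Broadcast: even ranks push the completed sum to their odd neighbour.
    if rank % 2 == 0:
        comm.Send([buf, MPI.FLOAT], dest=rank + 1)
    else:
        comm.Recv([buf, MPI.FLOAT], source=rank - 1)
    comm.Barrier()
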
def exchange(self):

    mpidtype = self.mpidtype

    # divide the source params before exchanging
    if self.avg:
        self.avg_func()

    # copy weights from param_ga to param_update_ga
    for param, param_update_ga in \
            zip(self.source_param_list, self.param_update_ga_list):
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        self.drv.memcpy_dtod(param_update_ga.ptr,
                             param_ga.ptr,
                             param_ga.dtype.itemsize *
                             param_ga.size)
    self.ctx.synchronize()

    # allreduce weights from param_update_ga into itself
    wcount = 0
    for param_update_ga in self.param_update_ga_list:

        # compress float32 params to float16 before communicating
        self.float2half(param_update_ga, self.d_param_16_list[wcount],
                        self.numElements_list[wcount], self.offset_list[wcount],
                        block=(256, 1, 1), grid=self.grid_size_list[wcount])
        self.ctx.synchronize()

        self.comm.Alltoall(
            [bufint(self.d_param_16_list[wcount]), mpidtype],
            [bufint(self.d_param_16_tmp_list[wcount]), mpidtype])

        # sum the received float16 chunks on the GPU
        self.sumhalfs(self.d_param_16_tmp_list[wcount],
                      self.d_param_16_sum_list[wcount],
                      self.reduce_size_list[wcount], self.ranksize,
                      self.reduce_size_list[wcount],
                      block=(256, 1, 1), grid=self.grid_sum_size_list[wcount])
        self.ctx.synchronize()

        self.comm.Allgather(
            [bufint(self.d_param_16_sum_list[wcount]), mpidtype],
            [bufint(self.d_param_16_update_list[wcount]), mpidtype])
        # d_param_16_update_list redundant

        # decompress float16 back to float32
        self.half2float(self.d_param_16_update_list[wcount], param_update_ga,
                        self.numElements_list[wcount], self.offset_list[wcount],
                        block=(256, 1, 1), grid=self.grid_size_list[wcount])
        # d_param_16_update_list redundant
        self.ctx.synchronize()

        wcount += 1

    # copy weights from param_update_ga back to param_ga
    for param, param_update_ga in \
            zip(self.dest_param_list, self.param_update_ga_list):
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        self.drv.memcpy_dtod(param_ga.ptr,
                             param_update_ga.ptr,
                             param_update_ga.dtype.itemsize *
                             param_ga.size)
    self.ctx.synchronize()

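# The float16 path compresses on the GPU (float2half), scatters chunks with
# Alltoall, sums the received chunks (sumhalfs), rebuilds the full vector
# with Allgather and decompresses (half2float). A host-side sketch of that
# reduce-scatter/allgather structure, assuming the flattened parameter length
# is a multiple of the world size and using NumPy float16 casts in place of
# the CUDA kernels; MPI.SHORT is used only as a 2-byte carrier type for the
# half-precision buffers, and the function name is illustrative.
import numpy as np
from mpi4py import MPI


def fp16_allreduce_sketch(param, comm=MPI.COMM_WORLD):
    size = comm.Get_size()
    send16 = param.astype(np.float16)                 # float2half
    recv16 = np.empty_like(send16)
    comm.Alltoall([send16, MPI.SHORT], [recv16, MPI.SHORT])
    # recv16 now holds every rank's contribution to this rank's segment;
    # accumulate in float32 to limit rounding error (the sumhalfs step).
    seg = recv16.reshape(size, -1).astype(np.float32).sum(axis=0)
    seg16 = seg.astype(np.float16)
    out16 = np.empty_like(send16)
    comm.Allgather([seg16, MPI.SHORT], [out16, MPI.SHORT])
    param[:] = out16.astype(np.float32)               # half2float
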
def exchange(self):

    mpidtype = self.mpidtype

    # divide the source params before exchanging
    if self.avg:
        self.avg_func()

    # copy weights from param_ga to param_update_ga
    for param, param_update_ga in \
            zip(self.source_param_list, self.param_update_ga_list):
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        self.drv.memcpy_dtod(param_update_ga.ptr,
                             param_ga.ptr,
                             param_ga.dtype.itemsize *
                             param_ga.size)
    self.ctx.synchronize()

    # allreduce weights from param_update_ga into itself
    wcount = 0
    for param_update_ga in self.param_update_ga_list:

        self.comm.Alltoall(
            [bufint(param_update_ga), mpidtype],
            [bufint(self.d_param_32_tmp_list[wcount]), mpidtype])

        # sumfloats(float* f1, float* f2, int numElements,
        #           int ranksize, int reducesize)
        self.d_f32_sumfloats(self.d_param_32_tmp_list[wcount],
                             self.d_param_32_sum_list[wcount],
                             self.reduce_size_list[wcount], self.ranksize,
                             self.reduce_size_list[wcount],
                             block=(256, 1, 1),
                             grid=self.grid_sum_size_list[wcount])
        self.ctx.synchronize()

        self.comm.Allgather(
            [bufint(self.d_param_32_sum_list[wcount]), mpidtype],
            [bufint(param_update_ga), mpidtype])

        # param.container.value.release_buffer(param_buf)

        wcount += 1

    # copy weights from param_update_ga back to param_ga
    for param, param_update_ga in \
            zip(self.dest_param_list, self.param_update_ga_list):
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        self.drv.memcpy_dtod(param_ga.ptr,
                             param_update_ga.ptr,
                             param_update_ga.dtype.itemsize *
                             param_ga.size)
    self.ctx.synchronize()

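# The float32 variant above hand-rolls an allreduce out of Alltoall, a GPU
# summing kernel and Allgather over device buffers. On host memory with
# standard mpi4py, the same result comes from a single collective; a sketch,
# with `param` a float32 NumPy array holding the already-averaged weights
# (the function name is illustrative only):
import numpy as np
from mpi4py import MPI


def allreduce_sketch(param, comm=MPI.COMM_WORLD):
    total = np.empty_like(param)
    comm.Allreduce([param, MPI.FLOAT], [total, MPI.FLOAT], op=MPI.SUM)
    param[:] = total
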