def c_code(self, node, nodename, inputs, outputs, sub): W, b, d, H, RShape = inputs fail = sub['fail'] R = outputs[0] codeSource = """ ///////////// < code generated by GpuConvTransp3D > //printf("\t\t\t\tGpuConvTransp c code\\n"); //Check dimensionality of inputs if (%(H)s->nd != 5) { PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd); %(fail)s } if (%(W)s->nd != 5) { PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor"); %(fail)s } if (%(b)s->nd != 1) { PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector"); %(fail)s } if (%(d)s->nd != 1) { PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector"); %(fail)s } //Read and check stride arguments if (%(d)s->dimensions[0] != 3) { PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", (long)%(d)s->dimensions[0]); %(fail)s } { // for fail const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); if (dr <= 0 || dc <= 0 || dt <= 0) { PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); %(fail)s } //Read and check sizes of inputs { // for fail const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0]; const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0]; if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels) { PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i, %%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]); %(fail)s } { // for fail const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4]; if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels) { PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels ); %(fail)s } { // for fail const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1]; const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2]; const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3]; const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1]; const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2]; const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3]; int videoHeight = (outputHeight-1) * dr + filterHeight; int videoWidth = (outputWidth-1) * dc + filterWidth; int videoDur = (outputDur-1) * dt + filterDur; if (%(RShape)s) { if (%(RShape)s->nd != 1) { PyErr_Format(PyExc_ValueError, "RShape must be a vector"); %(fail)s } if (%(RShape)s->dimensions[0] != 3) { PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )"); %(fail)s } { // for fail dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0); dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1); dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2); if (RShape0 != -1) { if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur) { PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]", videoHeight, videoWidth, videoDur, (int) RShape0, (int) RShape1, (int) RShape2); %(fail)s } videoHeight = RShape0; videoWidth = RShape1; videoDur = RShape2; } } //Allocate the reconstruction npy_intp dims[5]; dims[0] = batchSize; dims[4] = inputChannels; dims[1] = videoHeight; dims[2] = videoWidth; dims[3] = videoDur; if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] || CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){ Py_XDECREF(%(R)s); %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); if (!(%(R)s)) { PyErr_Format(PyExc_MemoryError,"Could not allocate R"); %(fail)s; } } cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur); { // for fail bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s); int version = -1; int verbose = 0; bool subsample =(dr>1)||(dc>1)||(dt>1); bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1); if (verbose) printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]); bool work_complete = false; const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4]; const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3]; const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2]; const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1]; const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0]; const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4]; const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3]; const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2]; const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1]; const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0]; if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){ //conv_transp_rows_stack dim3 grid(batchSize * inputChannels, videoHeight * videoWidth); dim3 threads(videoDur); int shared_size=0; conv_transp_rows_stack<<<grid, threads, shared_size>>>( CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s), videoHeight, videoWidth, videoDur, filterHeight, filterWidth, filterDur, outputHeight, outputWidth, outputDur, outputChannels, inputChannels, dr,dc,dt, hs3,hs2,hs1,hs4,hs0, ws3,ws2,ws1,ws4,ws0, CudaNdarray_HOST_STRIDES(%(b)s)[0]); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess == sts) { work_complete = true; if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n"); } else { if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts)); PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)", cudaGetErrorString(sts)); %(fail)s } } if(!work_complete){ PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d", out_contiguous,b_strided,outputDur); %(fail)s } }}}}}} // for fail ///////////// < /code generated by GpuConvTransp3D > """ return strutil.renderString(codeSource, locals())
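# A small hypothetical helper (not part of the Op) restating the output geometry
# the C code above computes: the minimal reconstruction size along each spatial
# axis is (output - 1) * stride + filter, and an RShape whose first entry is -1
# means "use that minimal shape".
def gpu_conv_transp3d_out_shape(H_shape, W_shape, d, RShape=None):
    dr, dc, dt = d
    fH, fW, fD = W_shape[1], W_shape[2], W_shape[3]
    vidH = (H_shape[1] - 1) * dr + fH
    vidW = (H_shape[2] - 1) * dc + fW
    vidD = (H_shape[3] - 1) * dt + fD
    if RShape is not None and RShape[0] != -1:
        # RShape may only enlarge the reconstruction, mirroring the C check
        assert RShape[0] >= vidH and RShape[1] >= vidW and RShape[2] >= vidD
        vidH, vidW, vidD = RShape
    return (H_shape[0], vidH, vidW, vidD, W_shape[4])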
def c_code(self, node, nodename, inputs, outputs, sub): V, W, b, d = inputs fail = sub['fail'] H = outputs[0] codeSource = """ ///////////// < code generated by Conv3D > //printf("\t\t\t\tConv3D c code\\n"); //Check dimensionality of inputs if (%(W)s->nd != 5) { PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor"); %(fail)s } if (%(V)s->nd != 5) { PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor"); %(fail)s } if (%(b)s->nd != 1) { PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector."); %(fail)s } if (%(d)s->nd != 1) { PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector."); %(fail)s } if (%(d)s->dimensions[0] != 3) { PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)%(d)s->dimensions[0]); %(fail)s } //Read and check sizes of inputs { // extra scope so error handler jumps don't cause errors const int batchSize = %(V)s->dimensions[0]; const int outputChannels = %(W)s->dimensions[0]; const int inputChannels = %(V)s->dimensions[4]; if (%(W)s->dimensions[4] != inputChannels) { PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%ld channel image but the image has %%d channels. Overall shape of input: (%%ld,%%ld,%%ld,%%ld,%%ld)", (long)%(W)s->dimensions[4], inputChannels, (long)%(V)s->dimensions[0], (long)%(V)s->dimensions[1], (long)%(V)s->dimensions[2], (long)%(V)s->dimensions[3], (long)%(V)s->dimensions[4]); %(fail)s } if (%(b)s->dimensions[0] != outputChannels) { PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%ld channel output image but the output has %%d channels", (long)%(b)s->dimensions[0], outputChannels); %(fail)s } { //extra scope so error handler jumps don't cause errors const int filterHeight = %(W)s->dimensions[1]; const int filterWidth = %(W)s->dimensions[2]; const int filterDur = %(W)s->dimensions[3]; const int vidHeight = %(V)s->dimensions[1]; const int vidWidth = %(V)s->dimensions[2]; const int vidDur = %(V)s->dimensions[3]; if (vidHeight < filterHeight) { PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight); %(fail)s } { // extra scope so fail works if (vidWidth < filterWidth) { PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth); %(fail)s } { // extra scope so fail works if (vidDur < filterDur) { PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur); %(fail)s } { // extra scope so fail works //Read and check stride arguments const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0); const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1); const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2); if (dr <= 0 || dc <= 0 || dt <= 0) { PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); %(fail)s } { // extra scope so fail works //Make correctly sized output const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1; const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1; const long long outputDur = int( (vidDur - filterDur) / dt ) +1; npy_intp dims[5]; dims[0] = batchSize; dims[4] = outputChannels; dims[1] = outputHeight; dims[2] = outputWidth; dims[3] = outputDur; if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] || %(H)s->dimensions[1]!=dims[1] || %(H)s->dimensions[2]!=dims[2] || %(H)s->dimensions[3]!=dims[3] || %(H)s->dimensions[4]!=dims[4]){ Py_XDECREF(%(H)s); %(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, 
%(V)s->descr->type_num); if (!(%(H)s)) { PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output."); %(fail)s } } { // extra scope so fail works #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) ) const int ws0 = %(W)s->strides[0]; const int ws1 = %(W)s->strides[1]; const int ws2 = %(W)s->strides[2]; const int vs1 = %(V)s->strides[1]; const int ws4 = %(W)s->strides[4]; const int vs4 = %(V)s->strides[4]; const int ws3 = %(W)s->strides[3]; const int vs3 = %(V)s->strides[3]; const int vs2 = %(V)s->strides[2]; const int bs = %(b)s->strides[0]; const int hs4 = %(H)s->strides[4]; // Compute H //H[i,r,c,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i,dr*r+k,dc*c+l,dt*t+m,z] //TODO: add special cases // ex: filterDur == 1 && batchSize == 1 && dt = 1 (for SFA) // ex: inputChannels == 1 """ #if the data types are not mixed, we can insert special case optimizations based on BLAS VV, WV, bv, dv = node.inputs HV = node.outputs[0] if VV.dtype == WV.dtype and HV.dtype == VV.dtype: if VV.dtype == 'float64': gemv = 'dgemv_' elif VV.dtype == 'float32': gemv = 'sgemv_' else: raise Exception('Unrecognized dtype for convolution ' + VV.dtype) codeSource += """ if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0))) { //std::cout << "lots of channels special case code" << std::endl; #define blas_type dtype_ ## %(V)s const blas_type constant_one = 1.0; char N = 'T'; int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0)); int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4)); int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4)); //special case code for the "lots of channels" case //uses a BLAS matrix vector multiply to compute the contribution of //all channels of an input pixel to all channels of an output pixel //simultaneously long long Hpos = 0; long long Vpos = 0; for (int i = 0; i < batchSize; i++) { long long Hposi = Hpos; long long Vposi = Vpos; for (int r = 0; r < outputHeight; r++) { long long Hposr = Hpos; long long Vposr = Vpos; for (int c = 0; c < outputWidth; c++) { long long Hposc = Hpos; long long Vposc = Vpos; for (int t = 0; t < outputDur; t++) { long long Hpost = Hpos; long long Vpost = Vpos; //of the loops so far, j should be the innermost, because //each loop through j visits the same elements of V //this implies that the last index of H should be the j index //since V and H should have the same format, this means //z should be the last index in v, and therefore the innermost //of the next set of for loops int Wpos = 0; int bPos = 0; long long Hposj = Hpos; for (int j = 0; j < outputChannels; j++) { // H[i,r,c,t,j] = b[j] ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos); Hposj += hs4; bPos += bs; } dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos); for (int k =0; k < filterHeight; k++) { int Wposk = Wpos; long long Vposk = Vpos; for (int l = 0; l < filterWidth; l++) { int Wposl = Wpos; long long Vposl = Vpos; for (int m = 0; m < filterDur; m++) { //H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:]) //note: changing the weights so that outputChannels and inputChannels were the last two rather than //the first and last elements did not speed this up, even for extremely large input sizes %(gemv)s(&N, & inputChannels, & outputChannels, &constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e, & ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one, writePos,& hs4e); Wpos += ws3; Vpos += vs3; } // close m Wpos = Wposl + ws2; Vpos = Vposl + vs2; } //close l Wpos = Wposk + %(W)s->strides[1]; Vpos = Vposk + %(V)s->strides[1]; } //close k Hpos = Hpost + %(H)s->strides[3]; Vpos = Vpost + vs3 * dt; } 
//close t Hpos = Hposc + %(H)s->strides[2]; Vpos = Vposc + vs2 * dc; } //close c Hpos = Hposr + %(H)s->strides[1]; Vpos = Vposr + %(V)s->strides[1] * dr; } //closes r Hpos = Hposi + %(H)s->strides[0]; Vpos = Vposi + %(V)s->strides[0]; } //closes i } //closes "lots of channels" special case code else """ codeSource += """ { //General case code //std::cout << "general case code" << std::endl; long long Hpos = 0; long long Vpos = 0; for (int i = 0; i < batchSize; i++) { long long Hposi = Hpos; long long Vposi = Vpos; for (int r = 0; r < outputHeight; r++) { long long Hposr = Hpos; long long Vposr = Vpos; for (int c = 0; c < outputWidth; c++) { long long Hposc = Hpos; long long Vposc = Vpos; for (int t = 0; t < outputDur; t++) { long long Hpost = Hpos; long long Vpost = Vpos; //of the loops so far, j should be the innermost, because //each loop through j visits the same elements of V //this implies that the last index of H should be the j index //since V and H should have the same format, this means //z should be the last index in v, and therefore the innermost //of the next set of for loops int Wpos = 0; int bPos = 0; for (int j = 0; j < outputChannels; j++) { long long Hposj = Hpos; long long Vposj = Vpos; int Wposj = Wpos; // H[i,r,c,t,j] = b[j] dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos); writePos = ELEM_AT(%(b)s,bPos); for (int k =0; k < filterHeight; k++) { int Wposk = Wpos; long long Vposk = Vpos; for (int l = 0; l < filterWidth; l++) { int Wposl = Wpos; long long Vposl = Vpos; for (int m = 0; m < filterDur; m++) { int Wposm = Wpos; long long Vposm = Vpos; for (int z = 0; z < inputChannels; z++) { //H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z] writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos); Wpos += ws4; Vpos += vs4; } // close z Wpos = Wposm + ws3; Vpos = Vposm + vs3; } // close m Wpos = Wposl + ws2; Vpos = Vposl + vs2; } //close l Wpos = Wposk + %(W)s->strides[1]; Vpos = Vposk + %(V)s->strides[1]; } //close k bPos += bs; Wpos = Wposj + ws0; Hpos = Hposj + hs4; Vpos = Vposj; //std::cout << "incremented Wpos by " << ws0 << std::endl; //std::cout << "incremented Hpos by " << hs4 << std::endl; } //close j Hpos = Hpost + %(H)s->strides[3]; Vpos = Vpost + vs3 * dt; } //close t Hpos = Hposc + %(H)s->strides[2]; Vpos = Vposc + vs2 * dc; } //close c Hpos = Hposr + %(H)s->strides[1]; Vpos = Vposr + %(V)s->strides[1] * dr; } //closes r Hpos = Hposi + %(H)s->strides[0]; Vpos = Vposi + %(V)s->strides[0]; } //closes i } //closes general case code }}}}}}} //extra scope so error handler jumps don't cross declarations ///////////// < /code generated by Conv3D > """ return strutil.renderString(codeSource,locals())
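# A minimal, unoptimized NumPy reference for the H formula the loop nests above
# implement, useful for checking them. Layouts follow the strides the Op uses:
# V: (batch, row, col, time, inChannels), W: (outChannels, fRow, fCol, fDur,
# inChannels), H: (batch, outRow, outCol, outDur, outChannels).
# `conv3d_reference` is a hypothetical name, not part of this module.
def conv3d_reference(V, W, b, d):
    import numpy as np
    dr, dc, dt = d
    batch, vidH, vidW, vidD, inC = V.shape
    outC, fH, fW, fD, _ = W.shape
    outH = (vidH - fH) // dr + 1
    outW = (vidW - fW) // dc + 1
    outD = (vidD - fD) // dt + 1
    H = np.empty((batch, outH, outW, outD, outC), dtype=V.dtype)
    for i in range(batch):
        for r in range(outH):
            for c in range(outW):
                for t in range(outD):
                    patch = V[i, dr*r:dr*r+fH, dc*c:dc*c+fW, dt*t:dt*t+fD, :]
                    # H[i,r,c,t,j] = b[j] + sum_{k,l,m,z} W[j,k,l,m,z] * V[i,dr*r+k,dc*c+l,dt*t+m,z]
                    H[i, r, c, t, :] = b + np.einsum('jklmz,klmz->j', W, patch)
    return H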
def c_code(self, node, nodename, inputs, outputs, sub): V, d, WShape, dCdH = inputs fail = sub['fail'] dCdW = outputs[0] codeSource = """ ///////////// < code generated by ConvGradW3D > //printf("\t\t\t\tConvGradW3D c code\\n"); //Check dimensionality of inputs if (PyArray_NDIM(%(dCdH)s) != 5) { PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor"); %(fail)s } if (PyArray_NDIM(%(V)s) != 5) { PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor"); %(fail)s } if (PyArray_NDIM(%(WShape)s) != 1) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be a vector."); %(fail)s } if (PyArray_NDIM(%(d)s) != 1) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: d must be a vector."); %(fail)s } if (PyArray_DIMS(%(d)s)[0] != 3) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]); %(fail)s } { //extra scope so that fail will not jump over declarations //Read and check sizes of inputs const int batchSize = PyArray_DIMS(%(V)s)[0]; if (PyArray_DIMS(%(WShape)s)[0] != 5) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must specify a 5D shape"); %(fail)s } if (!PyArray_ISCONTIGUOUS(%(WShape)s)) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous"); %(fail)s } { //extra scope so that fail will not jump over declarations dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) PyArray_DATA(%(WShape)s); const int outputChannels = WShape[0]; const int inputChannels = PyArray_DIMS(%(V)s)[4]; if (WShape[4] != inputChannels) { PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[4],inputChannels); %(fail)s } { //extra scope so fail works const int filterHeight = WShape[1]; const int filterWidth = WShape[2]; const int filterDur = WShape[3]; const int vidHeight = PyArray_DIMS(%(V)s)[1]; const int vidWidth = PyArray_DIMS(%(V)s)[2]; const int vidDur = PyArray_DIMS(%(V)s)[3]; if (vidHeight < filterHeight) { PyErr_Format(PyExc_ValueError, "ConvGrad3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight); %(fail)s } if (vidWidth < filterWidth) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth); %(fail)s } if (vidDur < filterDur) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur); %(fail)s } { // extra scope so fail works //Read and check stride arguments const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); if (dr <= 0 || dc <= 0 || dt <= 0) { PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt); %(fail)s } { // extra scope so fail works //Compute correctly sized output const int outputHeight = int( (vidHeight - filterHeight) / dr )+1; const int outputWidth = int( (vidWidth - filterWidth) / dc )+1; const int outputDur = int( (vidDur - filterDur) / dt ) +1; if (PyArray_DIMS(%(dCdH)s)[0] != batchSize || PyArray_DIMS(%(dCdH)s)[4] != outputChannels || PyArray_DIMS(%(dCdH)s)[1] != outputHeight || PyArray_DIMS(%(dCdH)s)[2] != outputWidth || PyArray_DIMS(%(dCdH)s)[3] != outputDur) { PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, 
(long)PyArray_DIMS(%(dCdH)s)[0], (long)PyArray_DIMS(%(dCdH)s)[1], (long)PyArray_DIMS(%(dCdH)s)[2], (long)PyArray_DIMS(%(dCdH)s)[3], (long)PyArray_DIMS(%(dCdH)s)[4]); %(fail)s } { // extra scope for fail npy_intp dims[5]; dims[0] = outputChannels; dims[4] = inputChannels; dims[1] = filterHeight; dims[2] = filterWidth; dims[3] = filterDur; if(!(%(dCdW)s) || PyArray_DIMS(%(dCdW)s)[0]!=dims[0] || PyArray_DIMS(%(dCdW)s)[1]!=dims[1] || PyArray_DIMS(%(dCdW)s)[2]!=dims[2] || PyArray_DIMS(%(dCdW)s)[3]!=dims[3] || PyArray_DIMS(%(dCdW)s)[4]!=dims[4] ){ Py_XDECREF(%(dCdW)s); %(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num); if (!(%(dCdW)s)) { PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW"); %(fail)s } } { //extra scope so fail works #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_DATA(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] ) #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) ) const int dhs3 = PyArray_STRIDES(%(dCdH)s)[3]; const int dtvs3 = dt * PyArray_STRIDES(%(V)s)[3]; // Compute dCdW //TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5 // dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j] * V[i,dr*p+k,dc*q+l,dt*r+m,z] for (int j = 0; j < outputChannels; j++) { for (int z = 0; z < inputChannels; z++) { for (int k = 0; k < filterHeight; k++) { for (int l = 0; l < filterWidth; l++) { for (int m = 0; m < filterDur; m++) { //printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z); dtype_%(dCdW)s & writePos = ELEM5(%(dCdW)s, j,k,l,m,z); writePos = 0; for (int i = 0; i < batchSize; i++) { for (int p = 0; p < outputHeight; p++) { for (int q = 0; q < outputWidth; q++) { int Hpos = i * PyArray_STRIDES(%(dCdH)s)[0] + j * PyArray_STRIDES(%(dCdH)s)[4] + p * PyArray_STRIDES(%(dCdH)s)[1] + q * PyArray_STRIDES(%(dCdH)s)[2] ; int Vpos = i * PyArray_STRIDES(%(V)s)[0] + z * PyArray_STRIDES(%(V)s)[4] + (dr * p+k) * PyArray_STRIDES(%(V)s)[1] + (dc*q+l) * PyArray_STRIDES(%(V)s)[2] + m * PyArray_STRIDES(%(V)s)[3]; for (int r = 0; r < outputDur; r++) { writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z); //writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos); Hpos += dhs3; Vpos += dtvs3; } } } } } } } } } }}}}}}} // extra scope for fail ///////////// < /code generated by ConvGradW3D > """ return strutil.renderString(codeSource, locals())
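# A NumPy sketch of the dCdW formula in the comment above, vectorized over the
# batch and module loops (i, p, q, r) using strided slices of V. The name
# `conv3d_grad_w_reference` is illustrative only.
def conv3d_grad_w_reference(V, dCdH, WShape, d):
    import numpy as np
    dr, dc, dt = d
    outC, fH, fW, fD, inC = WShape
    batch, outH, outW, outD, _ = dCdH.shape
    dCdW = np.empty((outC, fH, fW, fD, inC), dtype=V.dtype)
    # dCdW[j,k,l,m,z] = sum_{i,p,q,r} dCdH[i,p,q,r,j] * V[i, dr*p+k, dc*q+l, dt*r+m, z]
    for k in range(fH):
        for l in range(fW):
            for m in range(fD):
                patch = V[:, k:k+dr*outH:dr, l:l+dc*outW:dc, m:m+dt*outD:dt, :]
                dCdW[:, k, l, m, :] = np.einsum('ipqrj,ipqrz->jz', dCdH, patch)
    return dCdW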
def c_code(self, node, nodename, inputs, outputs, sub): W, b, d, H, RShape = inputs fail = sub['fail'] R = outputs[0] codeSource = """ ///////////// < code generated by ConvTransp3D > //printf("\t\t\t\tConvTransp3D c code\\n"); //Check dimensionality of inputs if (%(H)s->nd != 5) { PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd); %(fail)s } if (%(W)s->nd != 5) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor"); %(fail)s } if (%(b)s->nd != 1) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector"); %(fail)s } if (%(d)s->nd != 1) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector"); %(fail)s } //Read and check stride arguments if (%(d)s->dimensions[0] != 3) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", (long)%(d)s->dimensions[0] ); %(fail)s } { // for fail 1 int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); if (dr <= 0 || dc <= 0 || dt <= 0) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); %(fail)s } //Read and check sizes of inputs { // for fail 2 const int batchSize = %(H)s->dimensions[0]; const int outputChannels = %(W)s->dimensions[0]; if (%(H)s->dimensions[4] != outputChannels) { PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li, %%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)", outputChannels, (long)%(H)s->dimensions[4], (long)%(W)s->dimensions[0], (long)%(W)s->dimensions[1], (long)%(W)s->dimensions[2], (long)%(W)s->dimensions[3], (long)%(W)s->dimensions[4], (long)%(H)s->dimensions[0], (long)%(H)s->dimensions[1], (long)%(H)s->dimensions[2], (long)%(H)s->dimensions[3], (long)%(H)s->dimensions[4]); %(fail)s } { // for fail 3 const int inputChannels = %(W)s->dimensions[4]; if (%(b)s->dimensions[0] != inputChannels) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", (long)%(b)s->dimensions[0], inputChannels ); %(fail)s } { // for fail 4 const int filterHeight = %(W)s->dimensions[1]; const int filterWidth = %(W)s->dimensions[2]; const int filterDur = %(W)s->dimensions[3]; const int outputHeight = %(H)s->dimensions[1]; const int outputWidth = %(H)s->dimensions[2]; const int outputDur = %(H)s->dimensions[3]; int videoHeight = (outputHeight-1) * dr + filterHeight; int videoWidth = (outputWidth-1) * dc + filterWidth; int videoDur = (outputDur-1) * dt + filterDur; if (%(RShape)s) { if (%(RShape)s->nd != 1) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector"); %(fail)s } if (%(RShape)s->dimensions[0] != 3) { PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )"); %(fail)s } dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0); dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1); dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2); if (RShape0 != -1) { if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur) { PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2); %(fail)s } videoHeight = RShape0; videoWidth = 
RShape1; videoDur = RShape2; } } //closes if RShape { // for fail 5 //Allocate the reconstruction npy_intp dims[5]; dims[0] = batchSize; dims[4] = inputChannels; dims[1] = videoHeight; dims[2] = videoWidth; dims[3] = videoDur; if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] || %(R)s->dimensions[1]!=dims[1] || %(R)s->dimensions[2]!=dims[2] || %(R)s->dimensions[3]!=dims[3] || %(R)s->dimensions[4]!=dims[4]) { Py_XDECREF(%(R)s); %(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num); if (!(%(R)s)) { PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R"); %(fail)s } } { // for fail 6 #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] ) #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) ) dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data; int rs4 = %(R)s->strides[4]; int ws0 = %(W)s->strides[0]; int ws4 = %(W)s->strides[4]; int hs4 = %(H)s->strides[4]; // Compute R // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k] for (int i = 0; i < batchSize; i++) { for (int r = 0; r < videoHeight; r++) { const int frc = (int)std::max(0.0f, ceilf(float(r-filterHeight+1)/float(dr))); for (int c = 0; c < videoWidth; c++) { const int fcc = (int)std::max(0.0f, ceilf(float(c-filterWidth +1)/float(dc))); for (int t = 0; t < videoDur; t++) { const int ftc = (int)std::max(0.0f, ceilf(float(t-filterDur +1) /float(dt))); long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3]; long long Rpos = Rpost; for (int j = 0; j < inputChannels; j++) { //ELEM5(%(R)s, i,r,c,t,j) = b[j]; ELEM_AT(%(R)s,Rpos) = b[j]; Rpos += rs4; } for (int rc = frc; rc < outputHeight; rc++) { const int rk = r - rc * dr; if (rk < 0) break; for (int cc = fcc; cc < outputWidth; cc++) { const int ck = c - cc * dc; if (ck < 0) break; for (int tc = ftc; tc < outputDur; tc++) { const int tk = t - tc * dt; if (tk < 0) break; int Wpos = rk * %(W)s->strides[1] + ck * %(W)s->strides[2] + tk * %(W)s->strides[3]; int Hpostc = i * %(H)s->strides[0] + rc * %(H)s->strides[1] + cc * %(H)s->strides[2] + tc * %(H)s->strides[3]; Rpos = Rpost; for (int j = 0; j < inputChannels; j++) { int Wposj = Wpos; dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos); int Hpos = Hpostc; for (int k = 0; k < outputChannels; k++) { //TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H //writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k); //writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos); writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos); Wpos += ws0; Hpos += hs4; } //close the k loop Rpos += rs4; Wpos = Wposj + ws4; } //close the j loop } // close the tc loop } //cc } //rc } //t } //c } //r } //i } //for fail 6 } //for fail 5 } //for fail 4 } //for fail 3 } //for fail 2 } // for fail 1 ///////////// < /code generated by ConvTransp3D > """ return strutil.renderString(codeSource,locals())
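# The gather loop above (for each R position, find the H entries that map onto
# it) is equivalent to this forward "scatter" sketch: every H entry deposits a
# filter-sized block into R at stride (dr, dc, dt). `conv_transp3d_reference`
# is a hypothetical name used only for checking the Op.
def conv_transp3d_reference(W, b, d, H, RShape=None):
    import numpy as np
    dr, dc, dt = d
    outC, fH, fW, fD, inC = W.shape
    batch, oH, oW, oD, _ = H.shape
    vidH, vidW, vidD = (oH-1)*dr + fH, (oW-1)*dc + fW, (oD-1)*dt + fD
    if RShape is not None and RShape[0] != -1:
        vidH, vidW, vidD = RShape
    R = np.tile(b, (batch, vidH, vidW, vidD, 1)).astype(H.dtype)  # bias fill
    for rc in range(oH):
        for cc in range(oW):
            for tc in range(oD):
                # R[i, dr*rc+rk, dc*cc+ck, dt*tc+tk, j] += sum_k W[k,rk,ck,tk,j] * H[i,rc,cc,tc,k]
                R[:, dr*rc:dr*rc+fH, dc*cc:dc*cc+fW, dt*tc:dt*tc+fD, :] += \
                    np.einsum('kxyzj,ik->ixyzj', W, H[:, rc, cc, tc, :])
    return R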
def c_code(self, node, name, inputs, outputs, sub): partial_sum = self.partial_sum if self.partial_sum is not None else 0 images, hid_grads = inputs weights_grads, = outputs fail = sub['fail'] pad = self.pad # convFilterActs will multiply targets by scaleTargets # then add scaleOutput * (the convolution value) # We could make use of this to implement an inplace # addconv op but for this op we just want to compute # the convolution so we set them to 0 and 1 respectively # Note: there is another version of convFilterActs that # does not take these arguments, but it is just a wrapper # around the version that does take them, so we save # a function call by using the version that we use. basic_setup = """ #define scaleTargets 0 #define scaleOutput 1 """ if self.dense_connectivity: basic_setup += """ #define numGroups 1 """ basic_setup += """ #define paddingStart (-%(pad)d) const int *hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s); const int hidGradsSizeY = hid_grads_dims[1]; const int hidGradsSizeX = hid_grads_dims[2]; const int numModules = hidGradsSizeX * hidGradsSizeY; int partialSum = %(partial_sum)d > 0 ? %(partial_sum)d : numModules; if (numModules %% partialSum > 0) { PyErr_Format(PyExc_ValueError, "partialSum must divide numModules, but partialSum=%%d and " "numModules=%%d", partialSum, numModules); %(fail)s; } """ if self.stride != 1: raise UnimplementedError() else: basic_setup += """ #define moduleStride 1 """ if self.copy_non_contiguous: raise UnimplementedError() else: basic_setup += "#define WEIGHTACTS_COPY_NON_CONTIGUOUS 0\n" # The number of braces that must be closed at the end num_braces = 0 # Convert images into nv_images, an NVMatrix, for compatibility # with the cuda-convnet functions setup_nv_images = self._argument_contiguity_check("images") + """ if (%(images)s->nd != 4) { PyErr_Format(PyExc_ValueError, "images must have nd=4, got nd=%%i", %(images)s->nd); %(fail)s; } { //setup_nv_images brace 1 const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s); const int img_channels = images_dims[0]; if (img_channels > 3 && img_channels %% 4 != 0) { PyErr_Format(PyExc_ValueError, "images must have 3 or fewer channels, or have a multiple of 4 channels, got %%i", img_channels); %(fail)s; } { //setup_nv_images brace 2 const int * hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s); const int imgSizeY = images_dims[1]; const int imgSizeX = images_dims[2]; const int batch_size = images_dims[3]; NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX, batch_size, "weight_acts: nv_images"); """ num_braces += 2 # Convert hid_grads into nv_hid_grads, an NVMatrix, for compatibility # with the cuda-convnet functions setup_nv_hid_grads = self._argument_contiguity_check("hid_grads") + """ if (%(hid_grads)s->nd != 4) { PyErr_Format(PyExc_ValueError, "hid_grads must have nd=4, got nd=%%i", %(hid_grads)s->nd); %(fail)s; } { //setup_nv_hid_grads brace 1 const int numFilters = hid_grads_dims[0]; const int batch_size = hid_grads_dims[3]; NVMatrix nv_hid_grads(%(hid_grads)s, numFilters * hidGradsSizeY * hidGradsSizeX, batch_size, "weight_acts:nv_hid_grads"); """ num_braces += 1 setup_nv_weights_grads = """ int filters_dims[4]; // filters: (input channels, filter rows, filter cols, output channels) filters_dims[0] = img_channels; filters_dims[1] = imgSizeY - hidGradsSizeY + 1 - 2 * paddingStart; filters_dims[2] = imgSizeX - hidGradsSizeX + 1 - 2 * paddingStart; assert(filters_dims[1] == filters_dims[2]); // only square kernels are supported filters_dims[3] = numFilters; const int 
filterSize = filters_dims[1]; int partialsum_storage_dims[5]; for (int i = 1; i < 5; i++) { partialsum_storage_dims[i] = filters_dims[i - 1]; } partialsum_storage_dims[0] = numModules / partialSum; CudaNdarray *partialsum_storage = NULL; if (partialSum != numModules && CudaNdarray_prep_output(&partialsum_storage, 5, partialsum_storage_dims)) { %(fail)s; } for (int i = 0; i < 4; i++) { if (filters_dims[i] <= 0) { printf("filters_dims[%%d] = %%d\\n", i, filters_dims[i]); assert(false); } } if (CudaNdarray_prep_output(& %(weights_grads)s, 4, filters_dims)) { Py_XDECREF(partialsum_storage); %(fail)s; } { // setup_nv_weights_grad brace # 1 NVMatrix nv_weights_grads(%(weights_grads)s, filters_dims[0] * filterSize * filterSize, numFilters, "weight_acts:nv_weights_grads"); """ num_braces += 1 # note: imgSizeX is not specified here, it is computed internally # (in _filterActsSparse) by the lines: # int imgPixels = images.getNumRows() / numImgColors; # int imgSizeX = imgPixels / imgSizeY; # # note: numFilters is not specified here. it is determined by # nv_filters.getNumCols() # # note: the size of the filters is determined by dividing # nv_filters.getNumRows() by numFilterColors # run_kernel = """ if (partialSum == numModules) _weightActs(nv_images, nv_hid_grads, nv_weights_grads, imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize, paddingStart, moduleStride, img_channels, numGroups, partialSum, 0, 1); else { NVMatrix nv_partialsum(partialsum_storage, (numModules / partialSum) * filters_dims[0] * filterSize * filterSize, numFilters, "weight_acts: nv_partialsum"); _weightActs(nv_images, nv_hid_grads, nv_partialsum, imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize, paddingStart, moduleStride, img_channels, numGroups, partialSum, 0, 1); nv_partialsum.reshape((numModules / partialSum), filters_dims[0] * filterSize * filterSize * numFilters); // sum out axis 0 of nv_partialsum #define AXIS 0 // scale the contents of nv_weights_grads by 0 // i.e., clear out its pre-existing content #define SCALE_THIS 0 // scale the new sum by 1, i.e., don't do any scaling #define SCALE_SUM 1 nv_weights_grads.addSum(nv_partialsum, AXIS, SCALE_THIS, SCALE_SUM); Py_DECREF(partialsum_storage); } """ braces = '}' * num_braces rval = (basic_setup + setup_nv_images + setup_nv_hid_grads + setup_nv_weights_grads + run_kernel + braces) rval = renderString(rval, locals()) return rval
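# A NumPy sketch of the partialSum mechanism above: _weightActs first produces
# one gradient per group of `partialSum` modules, and the addSum call then sums
# those groups over axis 0 into the final weight gradient. Names here are
# illustrative, not part of the cuda-convnet API.
def weight_grads_partial_sum(per_module_grads, partial_sum):
    # per_module_grads: (numModules, inChannels, filterRows, filterCols, numFilters)
    import numpy as np
    num_modules = per_module_grads.shape[0]
    assert num_modules % partial_sum == 0, "partialSum must divide numModules"
    # what the kernel writes into partialsum_storage when partialSum < numModules
    storage = per_module_grads.reshape(
        (num_modules // partial_sum, partial_sum) + per_module_grads.shape[1:]
    ).sum(axis=1)
    # the addSum over AXIS 0 into nv_weights_grads
    return storage.sum(axis=0)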
def c_code(self, node, nodename, inputs, outputs, sub): W, b, d, H, RShape = inputs fail = sub['fail'] R = outputs[0] codeSource = """ ///////////// < code generated by ConvTransp3D > //printf("\t\t\t\tConvTransp3D c code\\n"); //Check dimensionality of inputs if (PyArray_NDIM(%(H)s) != 5) { PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D", PyArray_NDIM(%(H)s)); %(fail)s } if (PyArray_NDIM(%(W)s) != 5) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor"); %(fail)s } if (PyArray_NDIM(%(b)s) != 1) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector"); %(fail)s } if (PyArray_NDIM(%(d)s) != 1) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector"); %(fail)s } //Read and check stride arguments if (PyArray_DIMS(%(d)s)[0] != 3) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0] ); %(fail)s } { // for fail 1 int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); if (dr <= 0 || dc <= 0 || dt <= 0) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); %(fail)s } //Read and check sizes of inputs { // for fail 2 const int batchSize = PyArray_DIMS(%(H)s)[0]; const int outputChannels = PyArray_DIMS(%(W)s)[0]; if (PyArray_DIMS(%(H)s)[4] != outputChannels) { PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li, %%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)", outputChannels, (long)PyArray_DIMS(%(H)s)[4], (long)PyArray_DIMS(%(W)s)[0], (long)PyArray_DIMS(%(W)s)[1], (long)PyArray_DIMS(%(W)s)[2], (long)PyArray_DIMS(%(W)s)[3], (long)PyArray_DIMS(%(W)s)[4], (long)PyArray_DIMS(%(H)s)[0], (long)PyArray_DIMS(%(H)s)[1], (long)PyArray_DIMS(%(H)s)[2], (long)PyArray_DIMS(%(H)s)[3], (long)PyArray_DIMS(%(H)s)[4]); %(fail)s } { // for fail 3 const int inputChannels = PyArray_DIMS(%(W)s)[4]; if (PyArray_DIMS(%(b)s)[0] != inputChannels) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", (long)PyArray_DIMS(%(b)s)[0], inputChannels ); %(fail)s } { // for fail 4 const int filterHeight = PyArray_DIMS(%(W)s)[1]; const int filterWidth = PyArray_DIMS(%(W)s)[2]; const int filterDur = PyArray_DIMS(%(W)s)[3]; const int outputHeight = PyArray_DIMS(%(H)s)[1]; const int outputWidth = PyArray_DIMS(%(H)s)[2]; const int outputDur = PyArray_DIMS(%(H)s)[3]; int videoHeight = (outputHeight-1) * dr + filterHeight; int videoWidth = (outputWidth-1) * dc + filterWidth; int videoDur = (outputDur-1) * dt + filterDur; if (%(RShape)s) { if (PyArray_NDIM(%(RShape)s) != 1) { PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector"); %(fail)s } if (PyArray_DIMS(%(RShape)s)[0] != 3) { PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )"); %(fail)s } dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0); dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1); dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2); if (RShape0 != -1) { if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur) { PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be 
[%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2); %(fail)s } videoHeight = RShape0; videoWidth = RShape1; videoDur = RShape2; } } //closes if RShape { // for fail 5 //Allocate the reconstruction npy_intp dims[5]; dims[0] = batchSize; dims[4] = inputChannels; dims[1] = videoHeight; dims[2] = videoWidth; dims[3] = videoDur; if(!(%(R)s) || PyArray_DIMS(%(R)s)[0]!=dims[0] || PyArray_DIMS(%(R)s)[1]!=dims[1] || PyArray_DIMS(%(R)s)[2]!=dims[2] || PyArray_DIMS(%(R)s)[3]!=dims[3] || PyArray_DIMS(%(R)s)[4]!=dims[4]) { Py_XDECREF(%(R)s); %(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(H)s)->type_num); if (!(%(R)s)) { PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R"); %(fail)s } } { // for fail 6 #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_DATA(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] ) #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) ) dtype_%(b)s * b = (dtype_%(b)s *) PyArray_DATA(%(b)s); int rs4 = PyArray_STRIDES(%(R)s)[4]; int ws0 = PyArray_STRIDES(%(W)s)[0]; int ws4 = PyArray_STRIDES(%(W)s)[4]; int hs4 = PyArray_STRIDES(%(H)s)[4]; // Compute R // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k] for (int i = 0; i < batchSize; i++) { for (int r = 0; r < videoHeight; r++) { const int frc = (int)std::max(0.0f, ceilf(float(r-filterHeight+1)/float(dr))); for (int c = 0; c < videoWidth; c++) { const int fcc = (int)std::max(0.0f, ceilf(float(c-filterWidth +1)/float(dc))); for (int t = 0; t < videoDur; t++) { const int ftc = (int)std::max(0.0f, ceilf(float(t-filterDur +1) /float(dt))); long long Rpost = i * PyArray_STRIDES(%(R)s)[0] + r * PyArray_STRIDES(%(R)s)[1] + c * PyArray_STRIDES(%(R)s)[2] + t * PyArray_STRIDES(%(R)s)[3]; long long Rpos = Rpost; for (int j = 0; j < inputChannels; j++) { //ELEM5(%(R)s, i,r,c,t,j) = b[j]; ELEM_AT(%(R)s,Rpos) = b[j]; Rpos += rs4; } for (int rc = frc; rc < outputHeight; rc++) { const int rk = r - rc * dr; if (rk < 0) break; for (int cc = fcc; cc < outputWidth; cc++) { const int ck = c - cc * dc; if (ck < 0) break; for (int tc = ftc; tc < outputDur; tc++) { const int tk = t - tc * dt; if (tk < 0) break; int Wpos = rk * PyArray_STRIDES(%(W)s)[1] + ck * PyArray_STRIDES(%(W)s)[2] + tk * PyArray_STRIDES(%(W)s)[3]; int Hpostc = i * PyArray_STRIDES(%(H)s)[0] + rc * PyArray_STRIDES(%(H)s)[1] + cc * PyArray_STRIDES(%(H)s)[2] + tc * PyArray_STRIDES(%(H)s)[3]; Rpos = Rpost; for (int j = 0; j < inputChannels; j++) { int Wposj = Wpos; dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos); int Hpos = Hpostc; for (int k = 0; k < outputChannels; k++) { //TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H //writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k); //writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos); writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos); Wpos += ws0; Hpos += hs4; } //close the k loop Rpos += rs4; Wpos = Wposj + ws4; } //close the j loop } // close the tc loop } //cc } //rc } //t } //c } //r } //i } //for fail 6 } //for fail 5 } //for fail 4 } //for fail 3 } //for fail 2 } // for fail 1 ///////////// < /code generated by ConvTransp3D > """ return strutil.renderString(codeSource, locals())
def c_code(self, node, nodename, inputs, outputs, sub): V, W, b, d = inputs fail = sub['fail'] H = outputs[0] codeSource = """ ///////////// < code generated by Conv3D > //printf("\t\t\t\tConv3D c code\\n"); //Check dimensionality of inputs if (PyArray_NDIM(%(W)s) != 5) { PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor"); %(fail)s } if (PyArray_NDIM(%(V)s) != 5) { PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor"); %(fail)s } if (PyArray_NDIM(%(b)s) != 1) { PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector."); %(fail)s } if (PyArray_NDIM(%(d)s) != 1) { PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector."); %(fail)s } if (PyArray_DIMS(%(d)s)[0] != 3) { PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]); %(fail)s } //Read and check sizes of inputs { // extra scope so error handler jumps don't cause errors const int batchSize = PyArray_DIMS(%(V)s)[0]; const int outputChannels = PyArray_DIMS(%(W)s)[0]; const int inputChannels = PyArray_DIMS(%(V)s)[4]; if (PyArray_DIMS(%(W)s)[4] != inputChannels) { PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%ld channel image but the image has %%d channels. Overall shape of input: (%%ld,%%ld,%%ld,%%ld,%%ld)", (long)PyArray_DIMS(%(W)s)[4], inputChannels, (long)PyArray_DIMS(%(V)s)[0], (long)PyArray_DIMS(%(V)s)[1], (long)PyArray_DIMS(%(V)s)[2], (long)PyArray_DIMS(%(V)s)[3], (long)PyArray_DIMS(%(V)s)[4]); %(fail)s } if (PyArray_DIMS(%(b)s)[0] != outputChannels) { PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%ld channel output image but the output has %%d channels", (long)PyArray_DIMS(%(b)s)[0], outputChannels); %(fail)s } { //extra scope so error handler jumps don't cause errors const int filterHeight = PyArray_DIMS(%(W)s)[1]; const int filterWidth = PyArray_DIMS(%(W)s)[2]; const int filterDur = PyArray_DIMS(%(W)s)[3]; const int vidHeight = PyArray_DIMS(%(V)s)[1]; const int vidWidth = PyArray_DIMS(%(V)s)[2]; const int vidDur = PyArray_DIMS(%(V)s)[3]; if (vidHeight < filterHeight) { PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight); %(fail)s } { // extra scope so fail works if (vidWidth < filterWidth) { PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth); %(fail)s } { // extra scope so fail works if (vidDur < filterDur) { PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur); %(fail)s } { // extra scope so fail works //Read and check stride arguments const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0); const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1); const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2); if (dr <= 0 || dc <= 0 || dt <= 0) { PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); %(fail)s } { // extra scope so fail works //Make correctly sized output const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1; const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1; const long long outputDur = int( (vidDur - filterDur) / dt ) +1; npy_intp dims[5]; dims[0] = batchSize; dims[4] = outputChannels; dims[1] = outputHeight; dims[2] = outputWidth; dims[3] = outputDur; if(!(%(H)s) || PyArray_DIMS(%(H)s)[0]!=dims[0] || PyArray_DIMS(%(H)s)[1]!=dims[1] || PyArray_DIMS(%(H)s)[2]!=dims[2] || PyArray_DIMS(%(H)s)[3]!=dims[3] || 
PyArray_DIMS(%(H)s)[4]!=dims[4]){ Py_XDECREF(%(H)s); %(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num); if (!(%(H)s)) { PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output."); %(fail)s } } { // extra scope so fail works #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) ) const int ws0 = PyArray_STRIDES(%(W)s)[0]; const int ws1 = PyArray_STRIDES(%(W)s)[1]; const int ws2 = PyArray_STRIDES(%(W)s)[2]; const int vs1 = PyArray_STRIDES(%(V)s)[1]; const int ws4 = PyArray_STRIDES(%(W)s)[4]; const int vs4 = PyArray_STRIDES(%(V)s)[4]; const int ws3 = PyArray_STRIDES(%(W)s)[3]; const int vs3 = PyArray_STRIDES(%(V)s)[3]; const int vs2 = PyArray_STRIDES(%(V)s)[2]; const int bs = PyArray_STRIDES(%(b)s)[0]; const int hs4 = PyArray_STRIDES(%(H)s)[4]; // Compute H //H[i,r,c,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i,dr*r+k,dc*c+l,dt*t+m,z] //TODO: add special cases // ex: filterDur == 1 && batchSize == 1 && dt = 1 (for SFA) // ex: inputChannels == 1 """ #if the data types are not mixed, we can insert special case optimizations based on BLAS VV, WV, bv, dv = node.inputs HV = node.outputs[0] if (theano.config.blas.ldflags and VV.dtype == WV.dtype and HV.dtype == VV.dtype): if VV.dtype == 'float64': gemv = 'dgemv_' elif VV.dtype == 'float32': gemv = 'sgemv_' else: raise Exception('Unrecognized dtype for convolution ' + VV.dtype) codeSource += """ if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0))) { //std::cout << "lots of channels special case code" << std::endl; #define blas_type dtype_ ## %(V)s const blas_type constant_one = 1.0; char N = 'T'; int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0)); int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4)); int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4)); //special case code for the "lots of channels" case //uses a BLAS matrix vector multiply to compute the contribution of //all channels of an input pixel to all channels of an output pixel //simultaneously long long Hpos = 0; long long Vpos = 0; for (int i = 0; i < batchSize; i++) { long long Hposi = Hpos; long long Vposi = Vpos; for (int r = 0; r < outputHeight; r++) { long long Hposr = Hpos; long long Vposr = Vpos; for (int c = 0; c < outputWidth; c++) { long long Hposc = Hpos; long long Vposc = Vpos; for (int t = 0; t < outputDur; t++) { long long Hpost = Hpos; long long Vpost = Vpos; //of the loops so far, j should be the innermost, because //each loop through j visits the same elements of V //this implies that the last index of H should be the j index //since V and H should have the same format, this means //z should be the last index in v, and therefore the innermost //of the next set of for loops int Wpos = 0; int bPos = 0; long long Hposj = Hpos; for (int j = 0; j < outputChannels; j++) { // H[i,r,c,t,j] = b[j] ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos); Hposj += hs4; bPos += bs; } dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos); for (int k =0; k < filterHeight; k++) { int Wposk = Wpos; long long Vposk = Vpos; for (int l = 0; l < filterWidth; l++) { int Wposl = Wpos; long long Vposl = Vpos; for (int m = 0; m < filterDur; m++) { //H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:]) //note: changing the weights so that outputChannels and inputChannels were the last two rather than //the first and last elements did not speed this up, even for extremely large input sizes %(gemv)s(&N, & inputChannels, & outputChannels, &constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e, & ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one, 
writePos,& hs4e); Wpos += ws3; Vpos += vs3; } // close m Wpos = Wposl + ws2; Vpos = Vposl + vs2; } //close l Wpos = Wposk + PyArray_STRIDES(%(W)s)[1]; Vpos = Vposk + PyArray_STRIDES(%(V)s)[1]; } //close k Hpos = Hpost + PyArray_STRIDES(%(H)s)[3]; Vpos = Vpost + vs3 * dt; } //close t Hpos = Hposc + PyArray_STRIDES(%(H)s)[2]; Vpos = Vposc + vs2 * dc; } //close c Hpos = Hposr + PyArray_STRIDES(%(H)s)[1]; Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr; } //closes r Hpos = Hposi + PyArray_STRIDES(%(H)s)[0]; Vpos = Vposi + PyArray_STRIDES(%(V)s)[0]; } //closes i } //closes "lots of channels" special case code else """ codeSource += """ { //General case code //std::cout << "general case code" << std::endl; long long Hpos = 0; long long Vpos = 0; for (int i = 0; i < batchSize; i++) { long long Hposi = Hpos; long long Vposi = Vpos; for (int r = 0; r < outputHeight; r++) { long long Hposr = Hpos; long long Vposr = Vpos; for (int c = 0; c < outputWidth; c++) { long long Hposc = Hpos; long long Vposc = Vpos; for (int t = 0; t < outputDur; t++) { long long Hpost = Hpos; long long Vpost = Vpos; //of the loops so far, j should be the innermost, because //each loop through j visits the same elements of V //this implies that the last index of H should be the j index //since V and H should have the same format, this means //z should be the last index in v, and therefore the innermost //of the next set of for loops int Wpos = 0; int bPos = 0; for (int j = 0; j < outputChannels; j++) { long long Hposj = Hpos; long long Vposj = Vpos; int Wposj = Wpos; // H[i,r,c,t,j] = b[j] dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos); writePos = ELEM_AT(%(b)s,bPos); for (int k =0; k < filterHeight; k++) { int Wposk = Wpos; long long Vposk = Vpos; for (int l = 0; l < filterWidth; l++) { int Wposl = Wpos; long long Vposl = Vpos; for (int m = 0; m < filterDur; m++) { int Wposm = Wpos; long long Vposm = Vpos; for (int z = 0; z < inputChannels; z++) { //H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z] writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos); Wpos += ws4; Vpos += vs4; } // close z Wpos = Wposm + ws3; Vpos = Vposm + vs3; } // close m Wpos = Wposl + ws2; Vpos = Vposl + vs2; } //close l Wpos = Wposk + PyArray_STRIDES(%(W)s)[1]; Vpos = Vposk + PyArray_STRIDES(%(V)s)[1]; } //close k bPos += bs; Wpos = Wposj + ws0; Hpos = Hposj + hs4; Vpos = Vposj; //std::cout << "incremented Wpos by " << ws0 << std::endl; //std::cout << "incremented Hpos by " << hs4 << std::endl; } //close j Hpos = Hpost + PyArray_STRIDES(%(H)s)[3]; Vpos = Vpost + vs3 * dt; } //close t Hpos = Hposc + PyArray_STRIDES(%(H)s)[2]; Vpos = Vposc + vs2 * dc; } //close c Hpos = Hposr + PyArray_STRIDES(%(H)s)[1]; Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr; } //closes r Hpos = Hposi + PyArray_STRIDES(%(H)s)[0]; Vpos = Vposi + PyArray_STRIDES(%(V)s)[0]; } //closes i } //closes general case code }}}}}}} //extra scope so error handler jumps don't cross declarations ///////////// < /code generated by Conv3D > """ return strutil.renderString(codeSource, locals())
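# The "lots of channels" branch above replaces the j/z loops with one BLAS gemv
# per (k, l, m) filter tap. In NumPy terms, the update each gemv performs is the
# per-pixel matrix-vector product sketched below (illustrative helper, not the
# Op's API); W[:, k, l, m, :] is an (outputChannels x inputChannels) matrix,
# which is why the branch requires ws4 == sizeof(element), i.e. a contiguous
# channel axis.
def channel_mix_tap(H, V, W, i, r, c, t, k, l, m, d):
    dr, dc, dt = d
    # one gemv-equivalent update for a single filter tap
    H[i, r, c, t, :] += W[:, k, l, m, :] @ V[i, dr*r + k, dc*c + l, dt*t + m, :]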
def c_code(self, node, nodename, inputs, outputs, sub):
    V, W, b, d = inputs
    fail = sub['fail']
    H = outputs[0]
    codeSource = """
        ///////////// < code generated by GpuConv3D >

        //printf("\t\t\t\tConv3DGPU c code\\n");

        //Check dimensionality of inputs
        if (%(W)s->nd != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
            %(fail)s
        }

        if (%(V)s->nd != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
            %(fail)s
        }

        if (%(b)s->nd != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
            %(fail)s
        }

        if (%(d)s->nd != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
            %(fail)s
        }

        if (%(d)s->dimensions[0] != 3)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
            %(fail)s
        }

        { //extra scope so fail doesn't jump over declarations

        //Read and check sizes of inputs
        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
        const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];

        if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(W)s)[4], inputChannels);
            %(fail)s
        }

        { //extra scope so error handler jumps don't cause errors
        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];

        if (vidHeight < filterHeight)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
            %(fail)s
        }

        { // extra scope so fail works
        if (vidWidth < filterWidth)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a width of %%i but V is only %%i pixels wide", filterWidth, vidWidth);
            %(fail)s
        }

        { // extra scope so fail works
        if (vidDur < filterDur)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a duration of %%i but V is only %%i pixels long", filterDur, vidDur);
            %(fail)s
        }

        { // extra scope so fail works
        //Read and check stride arguments
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);

        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
            %(fail)s
        }

        { // extra scope so fail works
        //Make correctly sized output
        const int outputHeight = int((vidHeight - filterHeight) / dr) + 1;
        const int outputWidth = int((vidWidth - filterWidth) / dc) + 1;
        const int outputDur = int((vidDur - filterDur) / dt) + 1;

        npy_intp dims[5];
        dims[0] = batchSize;
        dims[4] = outputChannels;
        dims[1] = outputHeight;
        dims[2] = outputWidth;
        dims[3] = outputDur;

        if (!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0] != dims[0] ||
            CudaNdarray_HOST_DIMS(%(H)s)[1] != dims[1] ||
            CudaNdarray_HOST_DIMS(%(H)s)[2] != dims[2] ||
            CudaNdarray_HOST_DIMS(%(H)s)[3] != dims[3] ||
            CudaNdarray_HOST_DIMS(%(H)s)[4] != dims[4])
        {
            Py_XDECREF(%(H)s);
            %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5, dims);
            if (!(%(H)s))
            {
                PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
                %(fail)s
            }
        }

        { // extra scope so fail will not cross declarations
        //#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )

        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];

        // Compute H:
        // H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*x+k, dc*y+l, dt*t+m, z]

        bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
        int version = -1;
        int verbose = 0;
        bool subsample = (dr > 1) || (dc > 1) || (dt > 1);
        bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0] != 1) &&
                         !(CudaNdarray_HOST_STRIDES(%(b)s)[0] == 0 && outputChannels == 1);
        bool work_complete = false;

        if (out_contiguous && !b_strided && (version == 0 || version == -1) &&
            outputDur <= 512 && !work_complete)
        {
            //conv_rows_stack
            dim3 grid(outputHeight * outputWidth, batchSize * outputChannels);
            dim3 threads(outputDur);

            int shared_size = 0;
            conv_rows_stack<<<grid, threads, shared_size>>>(
                CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s),
                CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
                vidHeight, vidWidth, vidDur,
                filterHeight, filterWidth, filterDur,
                outputChannels, inputChannels,
                dr, dc, dt,
                vs3, vs2, vs1, vs4, vs0,
                ws3, ws2, ws1, ws4, ws0);
            CNDA_THREAD_SYNC;

            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose > 1)
                    printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n",
                           threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose)
                    printf("INFO: used 'conv_rows_stack' version\\n");
            }
            else
            {
                if (verbose)
                    printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n",
                           threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose)
                    printf("ERROR: all implementations failed for GpuConv3D! (%%s)", cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError,
                             "ERROR: all implementations failed for GpuConv3D! (%%s)",
                             cudaGetErrorString(sts));
                %(fail)s
            }
        }

        if (!work_complete)
        {
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
        }

        }}}}}}} //extra scope so error handler jumps don't cross declarations
        ///////////// < /code generated by GpuConv3D >
        """
    return strutil.renderString(codeSource, locals())
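# The kernel launched above computes
#     H[i,x,y,t,j] = b[j] + sum_{k,l,m,z} W[j,k,l,m,z] * V[i, dr*x+k, dc*y+l, dt*t+m, z]
# Below is a minimal NumPy sketch of that same computation, intended only as a
# CPU reference for checking the GPU path on small inputs. The function name
# `_conv3d_reference` and its calling convention are illustrative assumptions,
# not part of this module's API.
def _conv3d_reference(V, W, b, d):
    """Naive CPU reference for the GpuConv3D computation (sketch, not API)."""
    import numpy as np
    dr, dc, dt = int(d[0]), int(d[1]), int(d[2])
    batchSize, vidHeight, vidWidth, vidDur, inputChannels = V.shape
    outputChannels, filterHeight, filterWidth, filterDur, _ = W.shape
    # Output shape mirrors the generated C code: int((vid - filter)/stride) + 1.
    outputHeight = (vidHeight - filterHeight) // dr + 1
    outputWidth = (vidWidth - filterWidth) // dc + 1
    outputDur = (vidDur - filterDur) // dt + 1
    H = np.empty((batchSize, outputHeight, outputWidth, outputDur,
                  outputChannels), dtype=V.dtype)
    for i in range(batchSize):
        for x in range(outputHeight):
            for y in range(outputWidth):
                for t in range(outputDur):
                    # Strided window of V; its shape matches W[j]:
                    # (filterHeight, filterWidth, filterDur, inputChannels).
                    patch = V[i,
                              dr * x: dr * x + filterHeight,
                              dc * y: dc * y + filterWidth,
                              dt * t: dt * t + filterDur, :]
                    for j in range(outputChannels):
                        H[i, x, y, t, j] = b[j] + np.sum(W[j] * patch)
    return H
# Example use (hypothetical shapes): with V of shape (batch, h, w, dur, inChan),
# W of shape (outChan, fh, fw, fdur, inChan), b of shape (outChan,), and
# d = (dr, dc, dt), `_conv3d_reference(V, W, b, d)` returns H of shape
# (batch, outH, outW, outDur, outChan) as computed by the kernel above.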