Exemplo n.º 1
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        W, b, d, H, RShape = inputs
        fail = sub['fail']

        R = outputs[0]

        codeSource = """
            ///////////// < code generated by GpuConvTransp3D >

            //printf("\t\t\t\tGpuConvTransp c code\\n");

            //Check dimensionality of inputs
            if (%(H)s->nd != 5)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
                %(fail)s
            }

            if (%(W)s->nd != 5)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
                %(fail)s
            }

            if (%(b)s->nd != 1)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
                %(fail)s
            }

            if (%(d)s->nd != 1)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
                %(fail)s
            }

            //Read and check stride arguments
            if (%(d)s->dimensions[0] != 3)
            {
                PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
                %(fail)s
            }
{ // for fail
            const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
            const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
            if (dr <= 0 || dc <= 0 || dt <= 0)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                %(fail)s
            }


            //Read and check sizes of inputs

{ // for fail
            const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
            const int outputChannels =  CudaNdarray_HOST_DIMS(%(W)s)[0];

            if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
            {
                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
                %(fail)s
            }
{ // for fail

            const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];

            if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
            {
                PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
                %(fail)s
            }
{ // for fail

            const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
            const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
            const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
            const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
            const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
            const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];

            int videoHeight = (outputHeight-1) * dr + filterHeight;
            int videoWidth = (outputWidth-1) * dc + filterWidth;
            int videoDur = (outputDur-1) * dt + filterDur;


            if (%(RShape)s)
            {
                if (%(RShape)s->nd != 1)
                {
                    PyErr_Format(PyExc_ValueError, "RShape must be a vector");
                    %(fail)s
                }

                if (%(RShape)s->dimensions[0] != 3)
                {
                    PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                    %(fail)s
                }
{ // for fail

                                dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
                                dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
                                dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);

                if (RShape0 != -1)
                {
                    if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
                    {
                        PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape 1, RShape2 );
                        %(fail)s
                    }

                    videoHeight = RShape0;
                    videoWidth = RShape1;
                    videoDur = RShape2;
                }
            }

            //Allocate the reconstruction
            npy_intp dims[5];
            dims[0] = batchSize;
            dims[4] = inputChannels;
            dims[1] = videoHeight;
            dims[2] = videoWidth;
            dims[3] = videoDur;

                        if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
                    Py_XDECREF(%(R)s);
               %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
                if (!(%(R)s)) {
                    PyErr_Format(PyExc_MemoryError,"Could not allocate R");
                    %(fail)s;
                }
                        }
            cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);

{ // for fail

bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
bool work_complete = false;

const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];


if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
    //conv_transp_rows_stack
    dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
    dim3 threads(videoDur);

HERE

    int shared_size=0;
        conv_transp_rows_stack<<<grid, threads, shared_size>>>(
        CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
        videoHeight, videoWidth, videoDur,
        filterHeight, filterWidth, filterDur,
        outputHeight, outputWidth, outputDur,
        outputChannels, inputChannels,
        dr,dc,dt,
        hs3,hs2,hs1,hs4,hs0,
        ws3,ws2,ws1,ws4,ws0,
        CudaNdarray_HOST_STRIDES(%(b)s)[0]);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            work_complete = true;
        if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
        }
        else
        {
            if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
        }


}

if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
                         out_contiguous,b_strided,outputDur);
            %(fail)s
}



}}}}}} // for fail
            ///////////// < /code generated by GpuConvTransp3D >
        """
        return strutil.renderString(codeSource, locals())
Exemplo n.º 2
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        V, W, b, d = inputs
        fail = sub['fail']

        H = outputs[0]


        codeSource =  """
            ///////////// < code generated by Conv3D >

            //printf("\t\t\t\tConv3D c code\\n");

            //Check dimensionality of inputs
            if (%(W)s->nd != 5)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
                            %(fail)s

            }

            if (%(V)s->nd != 5)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
                            %(fail)s
            }

            if (%(b)s->nd != 1)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector.");
                %(fail)s
            }

            if (%(d)s->nd != 1)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector.");
                %(fail)s
            }

            if (%(d)s->dimensions[0] != 3)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)%(d)s->dimensions[0]);
                %(fail)s
            }

            //Read and check sizes of inputs
{ // exta scope so error handler jumps don't cause errors
            const int batchSize = %(V)s->dimensions[0];
            const int outputChannels =  %(W)s->dimensions[0];
            const int inputChannels = %(V)s->dimensions[4];

            if (%(W)s->dimensions[4] != inputChannels)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%ld channel image but the image has %%d channels. Overall shape of input: (%%ld,%%ld,%%ld,%%ld,%%ld)", (long)%(W)s->dimensions[4], inputChannels, (long)%(V)s->dimensions[0], (long)%(V)s->dimensions[1], (long)%(V)s->dimensions[2], (long)%(V)s->dimensions[3], (long)%(V)s->dimensions[4]);
                %(fail)s
            }

            if (%(b)s->dimensions[0] != outputChannels)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%ld channel output image but the output has %%d channels", (long)%(b)s->dimensions[0], outputChannels);
                %(fail)s
            }

{  //extra scope so error handler jumps don't cause errors
            const int filterHeight = %(W)s->dimensions[1];
            const int filterWidth = %(W)s->dimensions[2];
            const int filterDur = %(W)s->dimensions[3];
            const int vidHeight = %(V)s->dimensions[1];
            const int vidWidth = %(V)s->dimensions[2];
            const int vidDur = %(V)s->dimensions[3];\

            if (vidHeight < filterHeight)
            {
                PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
                %(fail)s
            }

{ // extra scope so fail works

            if (vidWidth < filterWidth)
            {
                PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
                %(fail)s
            }

{ // extra scope so fail works

            if (vidDur < filterDur)
            {
                PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
                %(fail)s
            }

{ // extra scope so fail works

            //Read and check stride arguments
            const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
            const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2);

            if (dr <= 0 || dc <= 0 || dt <= 0)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                %(fail)s
            }
{ // extra scope so fail works

            //Make correctly sized output
            const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1;
            const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1;
            const long long outputDur = int( (vidDur - filterDur) / dt ) +1;


            npy_intp dims[5];
            dims[0] = batchSize;
            dims[4] = outputChannels;
            dims[1] = outputHeight;
            dims[2] = outputWidth;
            dims[3] = outputDur;



            if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] ||
            %(H)s->dimensions[1]!=dims[1] ||
            %(H)s->dimensions[2]!=dims[2] ||
            %(H)s->dimensions[3]!=dims[3] ||
            %(H)s->dimensions[4]!=dims[4]){
                Py_XDECREF(%(H)s);
                %(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
                if (!(%(H)s)) {
                    PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output.");
                    %(fail)s
                }
            }
{ // extra scope so fail works


            #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )


            const int ws0 = %(W)s->strides[0];
            const int ws1 = %(W)s->strides[1];
            const int ws2 = %(W)s->strides[2];
            const int vs1 = %(V)s->strides[1];
            const int ws4 = %(W)s->strides[4];
            const int vs4 = %(V)s->strides[4];
            const int ws3 = %(W)s->strides[3];
            const int vs3 = %(V)s->strides[3];
            const int vs2 = %(V)s->strides[2];
            const int bs  = %(b)s->strides[0];
            const int hs4 = %(H)s->strides[4];



            // Compute H
            //H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dr*r+k,dc*c+l,dt*t+m]
            //TODO: add special cases
            // ex: filterDur == 1 && batchSize == 1 && dt = 1  (for SFA)
            // ex: inputChannels == 1 """








        #if the data types are not mixed, we can insert special case optimizations based on BLAS
        VV, WV, bv, dv = node.inputs
        HV = node.outputs[0]
        if VV.dtype == WV.dtype and HV.dtype == VV.dtype:
            if VV.dtype == 'float64':
                gemv = 'dgemv_'
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
                raise Exception('Unrecognized dtype for convolution '+V.value.dtype)

            codeSource += """
            if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
            {
              //std::cout << "lots of channels special case code" << std::endl;
              #define blas_type dtype_ ## %(V)s
              const blas_type  constant_one = 1.0;
              char N = 'T';
              int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0));
              int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4));
              int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4));

                //special case code for the "lots of channels" case
                //uses a BLAS matrix vector multiply to compute the contribute for
                //all channels of an input pixel to all channels of an output pixel
                //simultaneously
              long long Hpos = 0;
              long long Vpos = 0;
              for (int i = 0; i < batchSize; i++) {
                    long long Hposi = Hpos;
                    long long Vposi = Vpos;


                    for (int r = 0;  r < outputHeight; r++) {
                      long long Hposr = Hpos;
                      long long Vposr = Vpos;
                      for (int c = 0; c < outputWidth; c++) {
                       long long Hposc = Hpos;
                       long long Vposc = Vpos;
                       for (int t = 0; t < outputDur; t++) {
                            long long Hpost = Hpos;
                            long long Vpost = Vpos;
                            //of the loops so far, j should be the innermost, because
                            //each loop through j visits the same elements of V
                            //this implies that the last index of H should be the j index
                            //since V and H should have the same format, this means
                            //z should be the last index in v, and therefore the innermost
                            //of the next set of for loops

                            int Wpos = 0;
                            int bPos = 0;


                            long long Hposj = Hpos;
                            for (int j = 0; j < outputChannels; j++) {
                                // H[i,r,c,t,j] = b[j]
                                ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos);
                                Hposj += hs4;
                                bPos += bs;
                            }

                            dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);


                            for (int k =0; k < filterHeight; k++) {
                                  int Wposk = Wpos;
                                  long long Vposk = Vpos;
                                  for (int l = 0; l < filterWidth; l++) {
                                    int Wposl = Wpos;
                                    long long Vposl = Vpos;
                                    for (int m = 0; m < filterDur; m++) {

                                      //H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])


                                      //note: changing the weights so that outputChannels and inputChannels were the last two rather than
                                      //the first and last elements did not speed this up, even for extremely large input sizes

                                      %(gemv)s(&N, & inputChannels, & outputChannels,
                     &constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e,
                     & ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one,
                     writePos,& hs4e);

                                      Wpos  += ws3;
                                      Vpos  += vs3;
                                    } // close m
                                    Wpos = Wposl + ws2;
                                    Vpos = Vposl + vs2;
                                  } //close l
                                  Wpos = Wposk + %(W)s->strides[1];
                                  Vpos = Vposk + %(V)s->strides[1];
                                } //close k
                             Hpos = Hpost + %(H)s->strides[3];
                             Vpos = Vpost + vs3 * dt;
                         } //close t
                         Hpos = Hposc + %(H)s->strides[2];
                         Vpos = Vposc + vs2 * dc;
                       } //close c
                       Hpos = Hposr + %(H)s->strides[1];
                       Vpos = Vposr + %(V)s->strides[1] * dr;
                   } //closes r
                   Hpos = Hposi + %(H)s->strides[0];
                   Vpos = Vposi + %(V)s->strides[0];
              } //closes i


            } //closes "lots of channels" special case code
            else
"""

        codeSource += """
            {
              //General case code
              //std::cout << "general case code" << std::endl;
              long long Hpos = 0;
              long long Vpos = 0;
              for (int i = 0; i < batchSize; i++) {
                    long long Hposi = Hpos;
                    long long Vposi = Vpos;


                    for (int r = 0;  r < outputHeight; r++) {
                      long long Hposr = Hpos;
                      long long Vposr = Vpos;
                      for (int c = 0; c < outputWidth; c++) {
                       long long Hposc = Hpos;
                       long long Vposc = Vpos;
                       for (int t = 0; t < outputDur; t++) {
                            long long Hpost = Hpos;
                            long long Vpost = Vpos;
                            //of the loops so far, j should be the innermost, because
                            //each loop through j visits the same elements of V
                            //this implies that the last index of H should be the j index
                            //since V and H should have the same format, this means
                            //z should be the last index in v, and therefore the innermost
                            //of the next set of for loops

                            int Wpos = 0;
                            int bPos = 0;


                            for (int j = 0; j < outputChannels; j++) {


                                long long Hposj = Hpos;
                                long long Vposj = Vpos;
                                int Wposj = Wpos;

                                // H[i,r,c,t,j] = b[j]

                                dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);


                                writePos = ELEM_AT(%(b)s,bPos);


                                for (int k =0; k < filterHeight; k++) {
                                  int Wposk = Wpos;
                                  long long Vposk = Vpos;
                                  for (int l = 0; l < filterWidth; l++) {
                                    int Wposl = Wpos;
                                    long long Vposl = Vpos;
                                    for (int m = 0; m < filterDur; m++) {
                                      int Wposm = Wpos;
                                      long long Vposm = Vpos;
                                      for (int z = 0; z < inputChannels; z++) {
                                        //H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]


                                        writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);

                                        Wpos += ws4;
                                        Vpos += vs4;
                                      } // close z
                                      Wpos = Wposm + ws3;
                                      Vpos = Vposm + vs3;
                                    } // close m
                                    Wpos = Wposl + ws2;
                                    Vpos = Vposl + vs2;
                                  } //close l
                                  Wpos = Wposk + %(W)s->strides[1];
                                  Vpos = Vposk + %(V)s->strides[1];
                                } //close k


                              bPos += bs;
                              Wpos = Wposj + ws0;
                              Hpos = Hposj +  hs4;
                              Vpos = Vposj;
                              //std::cout << "incremented Wpos by " << ws0 << std::endl;
                              //std::cout << "incremented Hpos by " << hs4 << std::endl;
                             } //close j
                             Hpos = Hpost + %(H)s->strides[3];
                             Vpos = Vpost + vs3 * dt;
                         } //close t
                         Hpos = Hposc + %(H)s->strides[2];
                         Vpos = Vposc + vs2 * dc;
                       } //close c
                       Hpos = Hposr + %(H)s->strides[1];
                       Vpos = Vposr + %(V)s->strides[1] * dr;
                   } //closes r
                   Hpos = Hposi + %(H)s->strides[0];
                   Vpos = Vposi + %(V)s->strides[0];
              } //closes i
            } //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations
            ///////////// < /code generated by Conv3D >
        """

        return strutil.renderString(codeSource,locals())
Exemplo n.º 3
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        V, d, WShape, dCdH = inputs
        fail = sub['fail']

        dCdW = outputs[0]

        codeSource = """
            ///////////// < code generated by ConvGradW3D >

            //printf("\t\t\t\tConvGradW3D c code\\n");

            //Check dimensionality of inputs
            if (PyArray_NDIM(%(dCdH)s) != 5)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
                            %(fail)s
            }

            if (PyArray_NDIM(%(V)s) != 5)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
                %(fail)s
            }

            if (PyArray_NDIM(%(WShape)s) != 1)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be a vector.");
                %(fail)s
            }

            if (PyArray_NDIM(%(d)s) != 1)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: d must be a vector.");
                %(fail)s
            }

            if (PyArray_DIMS(%(d)s)[0] != 3)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]);
                %(fail)s
            }
{ //extra scope so that fail will not jump over declarations

            //Read and check sizes of inputs
            const int batchSize = PyArray_DIMS(%(V)s)[0];
            if (PyArray_DIMS(%(WShape)s)[0] != 5)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must specify a 5D shape");
                %(fail)s
            }
            if (!PyArray_ISCONTIGUOUS(%(WShape)s))
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
                %(fail)s
            }

{ //extra scope so that fail will not jump over declarations
            dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) PyArray_DATA(%(WShape)s);
            const int outputChannels =  WShape[0];
            const int inputChannels = PyArray_DIMS(%(V)s)[4];
            if (WShape[4] != inputChannels)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
                %(fail)s

            }
{ //extra scope so fail works
            const int filterHeight = WShape[1];
            const int filterWidth = WShape[2];
            const int filterDur = WShape[3];
            const int vidHeight = PyArray_DIMS(%(V)s)[1];
            const int vidWidth = PyArray_DIMS(%(V)s)[2];
            const int vidDur = PyArray_DIMS(%(V)s)[3];
            if (vidHeight < filterHeight)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
                %(fail)s
            }
            if (vidWidth < filterWidth)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a width of %%i but V is only %%i pixels tall",filterWidth,vidWidth);
                %(fail)s
            }
            if (vidDur < filterDur)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
                %(fail)s
            }

{ // extra scope so fail works
            //Read and check stride arguments
            const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
            const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
            if (dr <= 0 || dc <= 0 || dt <= 0)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt);
                %(fail)s
            }

{ // extra scope so fail works
            //Compute correct sized of output
            const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
            const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
            const int outputDur = int( (vidDur - filterDur) / dt ) +1;



            if (PyArray_DIMS(%(dCdH)s)[0] != batchSize ||
                PyArray_DIMS(%(dCdH)s)[4] != outputChannels ||
                PyArray_DIMS(%(dCdH)s)[1] != outputHeight ||
                PyArray_DIMS(%(dCdH)s)[2] != outputWidth ||
                PyArray_DIMS(%(dCdH)s)[3] != outputDur)
            {
                PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize,  outputHeight, outputWidth, outputDur, outputChannels, (long)PyArray_DIMS(%(dCdH)s)[0], (long)PyArray_DIMS(%(dCdH)s)[1], (long)PyArray_DIMS(%(dCdH)s)[2], (long)PyArray_DIMS(%(dCdH)s)[3], (long)PyArray_DIMS(%(dCdH)s)[4]);
                %(fail)s
            }
{ // extra scope for fail

            npy_intp dims[5];
            dims[0] = outputChannels;
            dims[4] = inputChannels;
            dims[1] = filterHeight;
            dims[2] = filterWidth;
            dims[3] = filterDur;

            if(!(%(dCdW)s)  || PyArray_DIMS(%(dCdW)s)[0]!=dims[0] ||
                  PyArray_DIMS(%(dCdW)s)[1]!=dims[1] ||
                  PyArray_DIMS(%(dCdW)s)[2]!=dims[2] ||
                  PyArray_DIMS(%(dCdW)s)[3]!=dims[3] ||
                  PyArray_DIMS(%(dCdW)s)[4]!=dims[4] ){
               Py_XDECREF(%(dCdW)s);
               %(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num);

               if (!(%(dCdW)s)) {
                  PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW");
                %(fail)s
               }
            }
{ //extra scope so fail works

            #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_DATA(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )

            #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )

            const int dhs3 = PyArray_STRIDES(%(dCdH)s)[3];
            const int dtvs3 = dt * PyArray_STRIDES(%(V)s)[3];

            // Compute dCdW
            //TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5
            // dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j]  *  V[i,dr*p+k,dc*q+l,dt*r+m,z]
            for (int j = 0; j < outputChannels; j++) {
                for (int z = 0; z < inputChannels; z++) {
                    for (int k = 0; k < filterHeight; k++) {
                        for (int l = 0; l < filterWidth; l++) {
                            for (int m = 0; m < filterDur; m++) {

                                //printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z);

                                dtype_%(dCdW)s & writePos =  ELEM5(%(dCdW)s, j,k,l,m,z);
                                writePos = 0;
                                for (int i = 0; i < batchSize; i++) {
                                    for (int p = 0; p < outputHeight; p++) {
                                        for (int q = 0; q < outputWidth; q++) {
                                            int Hpos = i * PyArray_STRIDES(%(dCdH)s)[0] + j * PyArray_STRIDES(%(dCdH)s)[4] + p * PyArray_STRIDES(%(dCdH)s)[1] + q * PyArray_STRIDES(%(dCdH)s)[2] ;
                                            int Vpos = i * PyArray_STRIDES(%(V)s)[0] + z * PyArray_STRIDES(%(V)s)[4] +  (dr * p+k) * PyArray_STRIDES(%(V)s)[1] +  (dc*q+l) * PyArray_STRIDES(%(V)s)[2] + m * PyArray_STRIDES(%(V)s)[3];

                                            for (int r = 0; r < outputDur; r++) {
                                                writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
                                                //writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
                                                Hpos += dhs3;
                                                Vpos += dtvs3;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

}}}}}}} // extra scope for fail
            ///////////// < /code generated by ConvGradW3D >
        """

        return strutil.renderString(codeSource, locals())
Exemplo n.º 4
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        W, b, d, H, RShape = inputs
        fail = sub['fail']

        R = outputs[0]

        codeSource = """
                    ///////////// < code generated by ConvTransp3D >

                    //printf("\t\t\t\tConvTransp3D c code\\n");

                    //Check dimensionality of inputs
                    if (%(H)s->nd != 5)
                    {
                        PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
                        %(fail)s
                    }

                    if (%(W)s->nd != 5)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
                %(fail)s
                    }

                    if (%(b)s->nd != 1)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
                         %(fail)s
                    }

                    if (%(d)s->nd != 1)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
                         %(fail)s
                    }

                    //Read and check stride arguments
                    if (%(d)s->dimensions[0] != 3)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", (long)%(d)s->dimensions[0] );
                         %(fail)s
                    }

                    { // for fail 1
                         int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
                         int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
                         int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);

                         if (dr <= 0 || dc <= 0 || dt <= 0)
                         {
                             PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                             %(fail)s
                          }


                         //Read and check sizes of inputs

                        { // for fail 2
                            const int batchSize = %(H)s->dimensions[0];
                            const int outputChannels =  %(W)s->dimensions[0];

                            if (%(H)s->dimensions[4] != outputChannels)
                            {
                                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li, %%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)", outputChannels, (long)%(H)s->dimensions[4], (long)%(W)s->dimensions[0], (long)%(W)s->dimensions[1], (long)%(W)s->dimensions[2], (long)%(W)s->dimensions[3], (long)%(W)s->dimensions[4], (long)%(H)s->dimensions[0], (long)%(H)s->dimensions[1], (long)%(H)s->dimensions[2], (long)%(H)s->dimensions[3], (long)%(H)s->dimensions[4]);
                                %(fail)s
                            }

                            { // for fail 3

                                const int inputChannels = %(W)s->dimensions[4];

                                if (%(b)s->dimensions[0] != inputChannels)
                                {
                                    PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", (long)%(b)s->dimensions[0], inputChannels );
                                    %(fail)s
                                }

                                { // for fail 4

                                const int filterHeight = %(W)s->dimensions[1];
                                const int filterWidth = %(W)s->dimensions[2];
                                const int filterDur = %(W)s->dimensions[3];
                                const int outputHeight = %(H)s->dimensions[1];
                                const int outputWidth = %(H)s->dimensions[2];
                                const int outputDur = %(H)s->dimensions[3];

                                int videoHeight = (outputHeight-1) * dr + filterHeight;
                                int videoWidth = (outputWidth-1) * dc + filterWidth;
                                int videoDur = (outputDur-1) * dt + filterDur;

                                if (%(RShape)s)
                                {
                                    if (%(RShape)s->nd != 1)
                                    {
                                        PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
                                        %(fail)s
                                    }

                                    if (%(RShape)s->dimensions[0] != 3)
                                    {
                                        PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                                        %(fail)s
                                    }

                                    dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
                                    dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
                                    dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);

                                    if (RShape0 != -1)
                                    {
                                        if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
                                        {
                                            PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2);
                                            %(fail)s
                                        }

                                        videoHeight = RShape0;
                                        videoWidth = RShape1;
                                        videoDur = RShape2;
                                   }
                               } //closes if RShape

                               { // for fail 5

                                   //Allocate the reconstruction
                                   npy_intp dims[5];
                                   dims[0] = batchSize;
                                   dims[4] = inputChannels;
                                   dims[1] = videoHeight;
                                   dims[2] = videoWidth;
                                   dims[3] = videoDur;

                                   if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] ||
                                    %(R)s->dimensions[1]!=dims[1] ||
                                    %(R)s->dimensions[2]!=dims[2] ||
                                    %(R)s->dimensions[3]!=dims[3] ||
                                    %(R)s->dimensions[4]!=dims[4])
                                   {
                                       Py_XDECREF(%(R)s);
                                       %(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num);
                                       if (!(%(R)s)) {
                                           PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
                                           %(fail)s
                                       }
                                   }

                                   { // for fail 6

                                       #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
                                       #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )



                                       dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data;

                                       int rs4 = %(R)s->strides[4];
                                       int ws0 = %(W)s->strides[0];
                                       int ws4 = %(W)s->strides[4];
                                       int hs4 = %(H)s->strides[4];

                                       // Compute R
                                       // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]

                                       for (int i = 0; i < batchSize; i++) {
                                        for (int r = 0; r < videoHeight; r++) {
                                         const int frc = (int)std::max(0.0f, ceilf(float(r-filterHeight+1)/float(dr)));
                                         for (int c = 0; c < videoWidth; c++) {
                                          const int fcc = (int)std::max(0.0f, ceilf(float(c-filterWidth +1)/float(dc)));
                                          for (int t = 0; t < videoDur; t++) {
                                           const int ftc = (int)std::max(0.0f, ceilf(float(t-filterDur +1)  /float(dt)));

                                           long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3];

                                           long long Rpos = Rpost;
                                           for (int j = 0; j < inputChannels; j++)
                                           {
                                            //ELEM5(%(R)s, i,r,c,t,j) = b[j];
                                            ELEM_AT(%(R)s,Rpos) = b[j];
                                            Rpos += rs4;
                                           }


                                           for (int rc = frc; rc < outputHeight; rc++) {
                                            const int rk = r - rc * dr;
                                            if (rk < 0) break;

                                            for (int cc = fcc; cc < outputWidth; cc++) {
                                             const int ck = c - cc * dc;
                                             if (ck < 0) break;

                                             for (int tc = ftc; tc < outputDur; tc++)
                                             {
                                              const int tk = t - tc * dt;
                                              if (tk < 0) break;

                                              int Wpos = rk * %(W)s->strides[1] +  ck * %(W)s->strides[2] + tk * %(W)s->strides[3];
                                              int Hpostc = i * %(H)s->strides[0] +      rc * %(H)s->strides[1] +  cc * %(H)s->strides[2] + tc * %(H)s->strides[3];
                                              Rpos = Rpost;
                                              for (int j = 0; j < inputChannels; j++)
                                              {
                                               int Wposj = Wpos;
                                               dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);

                                               int Hpos = Hpostc;

                                               for (int k = 0; k < outputChannels; k++) {
                                                //TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
                                                //writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
                                                //writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);

                                                writePos  += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);

                                                Wpos += ws0;
                                                Hpos += hs4;

                                               } //close the k loop
                                               Rpos += rs4;
                                               Wpos = Wposj +  ws4;
                                              } //close the j loop
                                             } // close the tc loop
                                            } //cc
                                           } //rc
                                          } //t
                                         } //c
                                        } //r
                                       } //i
                                   } //for fail 6
                               } //for fail 5
                           } //for fail 4
                       } //for fail 3
                   } //for fail 2
               } // for fail 1
               ///////////// < /code generated by ConvTransp3D >
                     """

        return strutil.renderString(codeSource,locals())
Exemplo n.º 5
0
    def c_code(self, node, name, inputs, outputs, sub):
        partial_sum = self.partial_sum if self.partial_sum is not None else 0
        images, hid_grads = inputs
        weights_grads, = outputs
        fail = sub['fail']
        pad = self.pad

        # convFilterActs will multiply targets by scaleTargets
        # then add scaleOutput * (the convolution value)
        # We could make use of this to implement an inplace
        # addconv op but for this op we just want to compute
        # the convolution so we set them to 0 and 1 respectively
        # Note: there is another version of convFilterActs that
        # does not take these arguments, but it is just a wrapper
        # around the version that does take them, so we save
        # a function call by using the version that we use.
        basic_setup = """
        #define scaleTargets 0
        #define scaleOutput 1
        """

        if self.dense_connectivity:
            basic_setup += """
            #define numGroups 1
            """

        basic_setup += """
        #define paddingStart (-%(pad)d)
        const int *hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s);
        const int hidGradsSizeY = hid_grads_dims[1];
        const int hidGradsSizeX = hid_grads_dims[2];
        const int numModules = hidGradsSizeX * hidGradsSizeY;
        int partialSum = %(partial_sum)d > 0 ? %(partial_sum)d : numModules;
        if (numModules %% partialSum > 0) {
            PyErr_Format(PyExc_ValueError,
                "partialSum must divide numModules, but partialSum=%%d and "
                "numModules=%%d", partialSum, numModules);
            %(fail)s;
        }
        """

        if self.stride != 1:
            raise UnimplementedError()
        else:
            basic_setup += """
            #define moduleStride 1
        """
        if self.copy_non_contiguous:
            raise UnimplementedError()
        else:
            basic_setup += "#define WEIGHTACTS_COPY_NON_CONTIGUOUS 0\n"

        # The amount of braces that must be closed at the end
        num_braces = 0

        # Convert images int nv_images, an NVMatrix, for compatibility
        # with the cuda-convnet functions
        setup_nv_images = self._argument_contiguity_check("images") + """
        if (%(images)s->nd != 4)
        {
            PyErr_Format(PyExc_ValueError,
                "images must have nd=4, got nd=%%i", %(images)s->nd);
            %(fail)s;
        }
        { //setup_nv_images brace 1
        const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
        const int img_channels = images_dims[0];
        if (img_channels > 3 && img_channels %% 4 != 0)
        {
            PyErr_Format(PyExc_ValueError,
                "images must have 3 or fewer channels, or have a multiple of 4 channels, got %%i",
                img_channels);
            %(fail)s;
        }

        { //setup_nv_images brace 2
        const int * hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s);
        const int imgSizeY = images_dims[1];
        const int imgSizeX = images_dims[2];
        const int batch_size = images_dims[3];
        NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX, batch_size, "weight_acts: nv_images");
        """
        num_braces += 2

        # Convert hid_grads int nv_hid_grads, an NVMatrix, for compatibility
        # with the cuda-convnet functions
        setup_nv_hid_grads = self._argument_contiguity_check("hid_grads") + """
        if (%(hid_grads)s->nd != 4)
        {
            PyErr_Format(PyExc_ValueError,
                "hid_grads must have nd=4, got nd=%%i", %(hid_grads)s->nd);
            %(fail)s;
        }

        { //setup_nv_hid_grads brace 1
        const int numFilters = hid_grads_dims[0];
        const int batch_size = hid_grads_dims[3];
        NVMatrix nv_hid_grads(%(hid_grads)s, numFilters * hidGradsSizeY *
                                           hidGradsSizeX, batch_size, "weight_acts:nv_hid_grads");
        """
        num_braces += 1


        setup_nv_weights_grads = """
        int filters_dims[4];
        // filters:  (input channels, filter rows, filter cols, output channels)
        filters_dims[0] = img_channels;
        filters_dims[1] = imgSizeY - hidGradsSizeY + 1 - 2 * paddingStart;
        filters_dims[2] = imgSizeX - hidGradsSizeX + 1 - 2 * paddingStart;
        assert(filters_dims[1] == filters_dims[2]); // only square kernels are supported
        filters_dims[3] = numFilters;
        const int filterSize = filters_dims[1];
        int partialsum_storage_dims[5];
        for (int i = 1; i < 5; i++)
        {
            partialsum_storage_dims[i] = filters_dims[i - 1];
        }
        partialsum_storage_dims[0] = numModules / partialSum;
        CudaNdarray *partialsum_storage = NULL;
        if (partialSum != numModules &&
            CudaNdarray_prep_output(&partialsum_storage, 5,
                                    partialsum_storage_dims))
        {
            %(fail)s;
        }

        for (int i = 0; i < 4; i++)
        {
            if (filters_dims[i] <= 0)
            {
                printf("filters_dims[%%d] = %%d\\n", i, filters_dims[i]);
                assert(false);
            }
        }
        if (CudaNdarray_prep_output(& %(weights_grads)s, 4, filters_dims))
        {
            Py_DECREF(partialsum_storage);
            %(fail)s;
        }

        { // setup_nv_weights_grad brace # 1

        NVMatrix nv_weights_grads(%(weights_grads)s, filters_dims[0] * filterSize * filterSize, numFilters,
        "weight_acts:nv_weights_grads");

        """

        num_braces += 1

        # note: imgSizeX is not specified here, it is computed internally
        # (in _filterActsSparse) by the lines:
        # int imgPixels = images.getNumRows() / numImgColors;
        # int imgSizeX = imgPixels / imgSizeY;
        #
        # note: numFilters is not specified here. it is determined by
        # nv_filters.getNumCols()
        #
        # note: the size of the filters is determined by dividing
        # nv_filters.getNumRows() by numFilterColors
        #
        run_kernel = """

        if (partialSum == numModules)
            _weightActs(nv_images, nv_hid_grads, nv_weights_grads,
                        imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
                        paddingStart, moduleStride, img_channels, numGroups,
                        partialSum, 0, 1);
        else {
            NVMatrix nv_partialsum(partialsum_storage, (numModules / partialSum) *
                     filters_dims[0] * filterSize * filterSize, numFilters,
                     "weight_acts: nv_partialsum");
            _weightActs(nv_images, nv_hid_grads, nv_partialsum,
                        imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
                        paddingStart, moduleStride, img_channels, numGroups,
                        partialSum, 0, 1);
            nv_partialsum.reshape((numModules / partialSum), filters_dims[0] * filterSize * filterSize * numFilters);

            // sum out axis 0 of nv_partialsum
            #define AXIS 0
            // scale the contents of nv_weights_grads by 0
            // i.e., clear out its pre-existing content
            #define SCALE_THIS 0
            // scale the new sum by 1, i.e., don't do any scaling
            #define SCALE_SUM 1
            nv_weights_grads.addSum(nv_partialsum, AXIS, SCALE_THIS, SCALE_SUM);

            Py_DECREF(partialsum_storage);
        }
        """

        braces = '}' * num_braces

        rval = (basic_setup +
                setup_nv_images +
                setup_nv_hid_grads +
                setup_nv_weights_grads +
                run_kernel +
                braces)

        rval = renderString(rval, locals())

        return rval
Exemplo n.º 6
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        W, b, d, H, RShape = inputs
        fail = sub['fail']

        R = outputs[0]

        codeSource = """
                    ///////////// < code generated by ConvTransp3D >

                    //printf("\t\t\t\tConvTransp3D c code\\n");

                    //Check dimensionality of inputs
                    if (PyArray_NDIM(%(H)s) != 5)
                    {
                        PyErr_Format(PyExc_ValueError,
                                     "H must be a 5-D tensor but it is %%i-D",
                                     PyArray_NDIM(%(H)s));
                        %(fail)s
                    }

                    if (PyArray_NDIM(%(W)s) != 5)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
                %(fail)s
                    }

                    if (PyArray_NDIM(%(b)s) != 1)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
                         %(fail)s
                    }

                    if (PyArray_NDIM(%(d)s) != 1)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
                         %(fail)s
                    }

                    //Read and check stride arguments
                    if (PyArray_DIMS(%(d)s)[0] != 3)
                    {
                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0] );
                         %(fail)s
                    }

                    { // for fail 1
                         int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
                         int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
                         int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);

                         if (dr <= 0 || dc <= 0 || dt <= 0)
                         {
                             PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                             %(fail)s
                          }


                         //Read and check sizes of inputs

                        { // for fail 2
                            const int batchSize = PyArray_DIMS(%(H)s)[0];
                            const int outputChannels =  PyArray_DIMS(%(W)s)[0];

                            if (PyArray_DIMS(%(H)s)[4] != outputChannels)
                            {
                                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li, %%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)", outputChannels, (long)PyArray_DIMS(%(H)s)[4], (long)PyArray_DIMS(%(W)s)[0], (long)PyArray_DIMS(%(W)s)[1], (long)PyArray_DIMS(%(W)s)[2], (long)PyArray_DIMS(%(W)s)[3], (long)PyArray_DIMS(%(W)s)[4], (long)PyArray_DIMS(%(H)s)[0], (long)PyArray_DIMS(%(H)s)[1], (long)PyArray_DIMS(%(H)s)[2], (long)PyArray_DIMS(%(H)s)[3], (long)PyArray_DIMS(%(H)s)[4]);
                                %(fail)s
                            }

                            { // for fail 3

                                const int inputChannels = PyArray_DIMS(%(W)s)[4];

                                if (PyArray_DIMS(%(b)s)[0] != inputChannels)
                                {
                                    PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", (long)PyArray_DIMS(%(b)s)[0], inputChannels );
                                    %(fail)s
                                }

                                { // for fail 4

                                const int filterHeight = PyArray_DIMS(%(W)s)[1];
                                const int filterWidth = PyArray_DIMS(%(W)s)[2];
                                const int filterDur = PyArray_DIMS(%(W)s)[3];
                                const int outputHeight = PyArray_DIMS(%(H)s)[1];
                                const int outputWidth = PyArray_DIMS(%(H)s)[2];
                                const int outputDur = PyArray_DIMS(%(H)s)[3];

                                int videoHeight = (outputHeight-1) * dr + filterHeight;
                                int videoWidth = (outputWidth-1) * dc + filterWidth;
                                int videoDur = (outputDur-1) * dt + filterDur;

                                if (%(RShape)s)
                                {
                                    if (PyArray_NDIM(%(RShape)s) != 1)
                                    {
                                        PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
                                        %(fail)s
                                    }

                                    if (PyArray_DIMS(%(RShape)s)[0] != 3)
                                    {
                                        PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                                        %(fail)s
                                    }

                                    dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
                                    dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
                                    dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);

                                    if (RShape0 != -1)
                                    {
                                        if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
                                        {
                                            PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2);
                                            %(fail)s
                                        }

                                        videoHeight = RShape0;
                                        videoWidth = RShape1;
                                        videoDur = RShape2;
                                   }
                               } //closes if RShape

                               { // for fail 5

                                   //Allocate the reconstruction
                                   npy_intp dims[5];
                                   dims[0] = batchSize;
                                   dims[4] = inputChannels;
                                   dims[1] = videoHeight;
                                   dims[2] = videoWidth;
                                   dims[3] = videoDur;

                                   if(!(%(R)s) || PyArray_DIMS(%(R)s)[0]!=dims[0] ||
                                    PyArray_DIMS(%(R)s)[1]!=dims[1] ||
                                    PyArray_DIMS(%(R)s)[2]!=dims[2] ||
                                    PyArray_DIMS(%(R)s)[3]!=dims[3] ||
                                    PyArray_DIMS(%(R)s)[4]!=dims[4])
                                   {
                                       Py_XDECREF(%(R)s);
                                       %(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(H)s)->type_num);
                                       if (!(%(R)s)) {
                                           PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
                                           %(fail)s
                                       }
                                   }

                                   { // for fail 6

                                       #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_DATA(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )
                                       #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )



                                       dtype_%(b)s * b = (dtype_%(b)s *) PyArray_DATA(%(b)s);

                                       int rs4 = PyArray_STRIDES(%(R)s)[4];
                                       int ws0 = PyArray_STRIDES(%(W)s)[0];
                                       int ws4 = PyArray_STRIDES(%(W)s)[4];
                                       int hs4 = PyArray_STRIDES(%(H)s)[4];

                                       // Compute R
                                       // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]

                                       for (int i = 0; i < batchSize; i++) {
                                        for (int r = 0; r < videoHeight; r++) {
                                         const int frc = (int)std::max(0.0f, ceilf(float(r-filterHeight+1)/float(dr)));
                                         for (int c = 0; c < videoWidth; c++) {
                                          const int fcc = (int)std::max(0.0f, ceilf(float(c-filterWidth +1)/float(dc)));
                                          for (int t = 0; t < videoDur; t++) {
                                           const int ftc = (int)std::max(0.0f, ceilf(float(t-filterDur +1)  /float(dt)));

                                           long long Rpost = i * PyArray_STRIDES(%(R)s)[0] + r * PyArray_STRIDES(%(R)s)[1] + c * PyArray_STRIDES(%(R)s)[2] + t * PyArray_STRIDES(%(R)s)[3];

                                           long long Rpos = Rpost;
                                           for (int j = 0; j < inputChannels; j++)
                                           {
                                            //ELEM5(%(R)s, i,r,c,t,j) = b[j];
                                            ELEM_AT(%(R)s,Rpos) = b[j];
                                            Rpos += rs4;
                                           }


                                           for (int rc = frc; rc < outputHeight; rc++) {
                                            const int rk = r - rc * dr;
                                            if (rk < 0) break;

                                            for (int cc = fcc; cc < outputWidth; cc++) {
                                             const int ck = c - cc * dc;
                                             if (ck < 0) break;

                                             for (int tc = ftc; tc < outputDur; tc++)
                                             {
                                              const int tk = t - tc * dt;
                                              if (tk < 0) break;

                                              int Wpos = rk * PyArray_STRIDES(%(W)s)[1] +  ck * PyArray_STRIDES(%(W)s)[2] + tk * PyArray_STRIDES(%(W)s)[3];
                                              int Hpostc = i * PyArray_STRIDES(%(H)s)[0] +      rc * PyArray_STRIDES(%(H)s)[1] +  cc * PyArray_STRIDES(%(H)s)[2] + tc * PyArray_STRIDES(%(H)s)[3];
                                              Rpos = Rpost;
                                              for (int j = 0; j < inputChannels; j++)
                                              {
                                               int Wposj = Wpos;
                                               dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);

                                               int Hpos = Hpostc;

                                               for (int k = 0; k < outputChannels; k++) {
                                                //TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
                                                //writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
                                                //writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);

                                                writePos  += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);

                                                Wpos += ws0;
                                                Hpos += hs4;

                                               } //close the k loop
                                               Rpos += rs4;
                                               Wpos = Wposj +  ws4;
                                              } //close the j loop
                                             } // close the tc loop
                                            } //cc
                                           } //rc
                                          } //t
                                         } //c
                                        } //r
                                       } //i
                                   } //for fail 6
                               } //for fail 5
                           } //for fail 4
                       } //for fail 3
                   } //for fail 2
               } // for fail 1
               ///////////// < /code generated by ConvTransp3D >
                     """

        return strutil.renderString(codeSource, locals())
Exemplo n.º 7
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        V, d, WShape, dCdH = inputs
        fail = sub['fail']

        dCdW = outputs[0]

        codeSource = """
            ///////////// < code generated by ConvGradW3D >

            //printf("\t\t\t\tConvGradW3D c code\\n");

            //Check dimensionality of inputs
            if (PyArray_NDIM(%(dCdH)s) != 5)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
                            %(fail)s
            }

            if (PyArray_NDIM(%(V)s) != 5)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
                %(fail)s
            }

            if (PyArray_NDIM(%(WShape)s) != 1)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be a vector.");
                %(fail)s
            }

            if (PyArray_NDIM(%(d)s) != 1)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: d must be a vector.");
                %(fail)s
            }

            if (PyArray_DIMS(%(d)s)[0] != 3)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]);
                %(fail)s
            }
{ //extra scope so that fail will not jump over declarations

            //Read and check sizes of inputs
            const int batchSize = PyArray_DIMS(%(V)s)[0];
            if (PyArray_DIMS(%(WShape)s)[0] != 5)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must specify a 5D shape");
                %(fail)s
            }
            if (!PyArray_ISCONTIGUOUS(%(WShape)s))
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
                %(fail)s
            }

{ //extra scope so that fail will not jump over declarations
            dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) PyArray_DATA(%(WShape)s);
            const int outputChannels =  WShape[0];
            const int inputChannels = PyArray_DIMS(%(V)s)[4];
            if (WShape[4] != inputChannels)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
                %(fail)s

            }
{ //extra scope so fail works
            const int filterHeight = WShape[1];
            const int filterWidth = WShape[2];
            const int filterDur = WShape[3];
            const int vidHeight = PyArray_DIMS(%(V)s)[1];
            const int vidWidth = PyArray_DIMS(%(V)s)[2];
            const int vidDur = PyArray_DIMS(%(V)s)[3];
            if (vidHeight < filterHeight)
            {
                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
                %(fail)s
            }
            if (vidWidth < filterWidth)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a width of %%i but V is only %%i pixels tall",filterWidth,vidWidth);
                %(fail)s
            }
            if (vidDur < filterDur)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
                %(fail)s
            }

{ // extra scope so fail works
            //Read and check stride arguments
            const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
            const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
            if (dr <= 0 || dc <= 0 || dt <= 0)
            {
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt);
                %(fail)s
            }

{ // extra scope so fail works
            //Compute correct sized of output
            const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
            const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
            const int outputDur = int( (vidDur - filterDur) / dt ) +1;



            if (PyArray_DIMS(%(dCdH)s)[0] != batchSize ||
                PyArray_DIMS(%(dCdH)s)[4] != outputChannels ||
                PyArray_DIMS(%(dCdH)s)[1] != outputHeight ||
                PyArray_DIMS(%(dCdH)s)[2] != outputWidth ||
                PyArray_DIMS(%(dCdH)s)[3] != outputDur)
            {
                PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize,  outputHeight, outputWidth, outputDur, outputChannels, (long)PyArray_DIMS(%(dCdH)s)[0], (long)PyArray_DIMS(%(dCdH)s)[1], (long)PyArray_DIMS(%(dCdH)s)[2], (long)PyArray_DIMS(%(dCdH)s)[3], (long)PyArray_DIMS(%(dCdH)s)[4]);
                %(fail)s
            }
{ // extra scope for fail

            npy_intp dims[5];
            dims[0] = outputChannels;
            dims[4] = inputChannels;
            dims[1] = filterHeight;
            dims[2] = filterWidth;
            dims[3] = filterDur;

            if(!(%(dCdW)s)  || PyArray_DIMS(%(dCdW)s)[0]!=dims[0] ||
                  PyArray_DIMS(%(dCdW)s)[1]!=dims[1] ||
                  PyArray_DIMS(%(dCdW)s)[2]!=dims[2] ||
                  PyArray_DIMS(%(dCdW)s)[3]!=dims[3] ||
                  PyArray_DIMS(%(dCdW)s)[4]!=dims[4] ){
               Py_XDECREF(%(dCdW)s);
               %(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num);

               if (!(%(dCdW)s)) {
                  PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW");
                %(fail)s
               }
            }
{ //extra scope so fail works

            #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_DATA(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )

            #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )

            const int dhs3 = PyArray_STRIDES(%(dCdH)s)[3];
            const int dtvs3 = dt * PyArray_STRIDES(%(V)s)[3];

            // Compute dCdW
            //TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5
            // dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j]  *  V[i,dr*p+k,dc*q+l,dt*r+m,z]
            for (int j = 0; j < outputChannels; j++) {
                for (int z = 0; z < inputChannels; z++) {
                    for (int k = 0; k < filterHeight; k++) {
                        for (int l = 0; l < filterWidth; l++) {
                            for (int m = 0; m < filterDur; m++) {

                                //printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z);

                                dtype_%(dCdW)s & writePos =  ELEM5(%(dCdW)s, j,k,l,m,z);
                                writePos = 0;
                                for (int i = 0; i < batchSize; i++) {
                                    for (int p = 0; p < outputHeight; p++) {
                                        for (int q = 0; q < outputWidth; q++) {
                                            int Hpos = i * PyArray_STRIDES(%(dCdH)s)[0] + j * PyArray_STRIDES(%(dCdH)s)[4] + p * PyArray_STRIDES(%(dCdH)s)[1] + q * PyArray_STRIDES(%(dCdH)s)[2] ;
                                            int Vpos = i * PyArray_STRIDES(%(V)s)[0] + z * PyArray_STRIDES(%(V)s)[4] +  (dr * p+k) * PyArray_STRIDES(%(V)s)[1] +  (dc*q+l) * PyArray_STRIDES(%(V)s)[2] + m * PyArray_STRIDES(%(V)s)[3];

                                            for (int r = 0; r < outputDur; r++) {
                                                writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
                                                //writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
                                                Hpos += dhs3;
                                                Vpos += dtvs3;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

}}}}}}} // extra scope for fail
            ///////////// < /code generated by ConvGradW3D >
        """

        return strutil.renderString(codeSource, locals())
Exemplo n.º 8
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        V, W, b, d = inputs
        fail = sub['fail']

        H = outputs[0]

        codeSource = """
            ///////////// < code generated by Conv3D >

            //printf("\t\t\t\tConv3D c code\\n");

            //Check dimensionality of inputs
            if (PyArray_NDIM(%(W)s) != 5)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
                            %(fail)s

            }

            if (PyArray_NDIM(%(V)s) != 5)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
                            %(fail)s
            }

            if (PyArray_NDIM(%(b)s) != 1)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector.");
                %(fail)s
            }

            if (PyArray_NDIM(%(d)s) != 1)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector.");
                %(fail)s
            }

            if (PyArray_DIMS(%(d)s)[0] != 3)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]);
                %(fail)s
            }

            //Read and check sizes of inputs
{ // exta scope so error handler jumps don't cause errors
            const int batchSize = PyArray_DIMS(%(V)s)[0];
            const int outputChannels =  PyArray_DIMS(%(W)s)[0];
            const int inputChannels = PyArray_DIMS(%(V)s)[4];

            if (PyArray_DIMS(%(W)s)[4] != inputChannels)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%ld channel image but the image has %%d channels. Overall shape of input: (%%ld,%%ld,%%ld,%%ld,%%ld)", (long)PyArray_DIMS(%(W)s)[4], inputChannels, (long)PyArray_DIMS(%(V)s)[0], (long)PyArray_DIMS(%(V)s)[1], (long)PyArray_DIMS(%(V)s)[2], (long)PyArray_DIMS(%(V)s)[3], (long)PyArray_DIMS(%(V)s)[4]);
                %(fail)s
            }

            if (PyArray_DIMS(%(b)s)[0] != outputChannels)
            {
                PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%ld channel output image but the output has %%d channels", (long)PyArray_DIMS(%(b)s)[0], outputChannels);
                %(fail)s
            }

{  //extra scope so error handler jumps don't cause errors
            const int filterHeight = PyArray_DIMS(%(W)s)[1];
            const int filterWidth = PyArray_DIMS(%(W)s)[2];
            const int filterDur = PyArray_DIMS(%(W)s)[3];
            const int vidHeight = PyArray_DIMS(%(V)s)[1];
            const int vidWidth = PyArray_DIMS(%(V)s)[2];
            const int vidDur = PyArray_DIMS(%(V)s)[3];\

            if (vidHeight < filterHeight)
            {
                PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
                %(fail)s
            }

{ // extra scope so fail works

            if (vidWidth < filterWidth)
            {
                PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
                %(fail)s
            }

{ // extra scope so fail works

            if (vidDur < filterDur)
            {
                PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
                %(fail)s
            }

{ // extra scope so fail works

            //Read and check stride arguments
            const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
            const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2);

            if (dr <= 0 || dc <= 0 || dt <= 0)
            {
                PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                %(fail)s
            }
{ // extra scope so fail works

            //Make correctly sized output
            const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1;
            const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1;
            const long long outputDur = int( (vidDur - filterDur) / dt ) +1;


            npy_intp dims[5];
            dims[0] = batchSize;
            dims[4] = outputChannels;
            dims[1] = outputHeight;
            dims[2] = outputWidth;
            dims[3] = outputDur;



            if(!(%(H)s) || PyArray_DIMS(%(H)s)[0]!=dims[0] ||
            PyArray_DIMS(%(H)s)[1]!=dims[1] ||
            PyArray_DIMS(%(H)s)[2]!=dims[2] ||
            PyArray_DIMS(%(H)s)[3]!=dims[3] ||
            PyArray_DIMS(%(H)s)[4]!=dims[4]){
                Py_XDECREF(%(H)s);
                %(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num);
                if (!(%(H)s)) {
                    PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output.");
                    %(fail)s
                }
            }
{ // extra scope so fail works


            #define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )


            const int ws0 = PyArray_STRIDES(%(W)s)[0];
            const int ws1 = PyArray_STRIDES(%(W)s)[1];
            const int ws2 = PyArray_STRIDES(%(W)s)[2];
            const int vs1 = PyArray_STRIDES(%(V)s)[1];
            const int ws4 = PyArray_STRIDES(%(W)s)[4];
            const int vs4 = PyArray_STRIDES(%(V)s)[4];
            const int ws3 = PyArray_STRIDES(%(W)s)[3];
            const int vs3 = PyArray_STRIDES(%(V)s)[3];
            const int vs2 = PyArray_STRIDES(%(V)s)[2];
            const int bs  = PyArray_STRIDES(%(b)s)[0];
            const int hs4 = PyArray_STRIDES(%(H)s)[4];



            // Compute H
            //H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dr*r+k,dc*c+l,dt*t+m]
            //TODO: add special cases
            // ex: filterDur == 1 && batchSize == 1 && dt = 1  (for SFA)
            // ex: inputChannels == 1 """

        #if the data types are not mixed, we can insert special case optimizations based on BLAS
        VV, WV, bv, dv = node.inputs
        HV = node.outputs[0]
        if (theano.config.blas.ldflags and VV.dtype == WV.dtype
                and HV.dtype == VV.dtype):
            if VV.dtype == 'float64':
                gemv = 'dgemv_'
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
                raise Exception('Unrecognized dtype for convolution ' +
                                V.value.dtype)

            codeSource += """
            if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
            {
              //std::cout << "lots of channels special case code" << std::endl;
              #define blas_type dtype_ ## %(V)s
              const blas_type  constant_one = 1.0;
              char N = 'T';
              int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0));
              int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4));
              int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4));

                //special case code for the "lots of channels" case
                //uses a BLAS matrix vector multiply to compute the contribute for
                //all channels of an input pixel to all channels of an output pixel
                //simultaneously
              long long Hpos = 0;
              long long Vpos = 0;
              for (int i = 0; i < batchSize; i++) {
                    long long Hposi = Hpos;
                    long long Vposi = Vpos;


                    for (int r = 0;  r < outputHeight; r++) {
                      long long Hposr = Hpos;
                      long long Vposr = Vpos;
                      for (int c = 0; c < outputWidth; c++) {
                       long long Hposc = Hpos;
                       long long Vposc = Vpos;
                       for (int t = 0; t < outputDur; t++) {
                            long long Hpost = Hpos;
                            long long Vpost = Vpos;
                            //of the loops so far, j should be the innermost, because
                            //each loop through j visits the same elements of V
                            //this implies that the last index of H should be the j index
                            //since V and H should have the same format, this means
                            //z should be the last index in v, and therefore the innermost
                            //of the next set of for loops

                            int Wpos = 0;
                            int bPos = 0;


                            long long Hposj = Hpos;
                            for (int j = 0; j < outputChannels; j++) {
                                // H[i,r,c,t,j] = b[j]
                                ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos);
                                Hposj += hs4;
                                bPos += bs;
                            }

                            dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);


                            for (int k =0; k < filterHeight; k++) {
                                  int Wposk = Wpos;
                                  long long Vposk = Vpos;
                                  for (int l = 0; l < filterWidth; l++) {
                                    int Wposl = Wpos;
                                    long long Vposl = Vpos;
                                    for (int m = 0; m < filterDur; m++) {

                                      //H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])


                                      //note: changing the weights so that outputChannels and inputChannels were the last two rather than
                                      //the first and last elements did not speed this up, even for extremely large input sizes

                                      %(gemv)s(&N, & inputChannels, & outputChannels,
                     &constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e,
                     & ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one,
                     writePos,& hs4e);

                                      Wpos  += ws3;
                                      Vpos  += vs3;
                                    } // close m
                                    Wpos = Wposl + ws2;
                                    Vpos = Vposl + vs2;
                                  } //close l
                                  Wpos = Wposk + PyArray_STRIDES(%(W)s)[1];
                                  Vpos = Vposk + PyArray_STRIDES(%(V)s)[1];
                                } //close k
                             Hpos = Hpost + PyArray_STRIDES(%(H)s)[3];
                             Vpos = Vpost + vs3 * dt;
                         } //close t
                         Hpos = Hposc + PyArray_STRIDES(%(H)s)[2];
                         Vpos = Vposc + vs2 * dc;
                       } //close c
                       Hpos = Hposr + PyArray_STRIDES(%(H)s)[1];
                       Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr;
                   } //closes r
                   Hpos = Hposi + PyArray_STRIDES(%(H)s)[0];
                   Vpos = Vposi + PyArray_STRIDES(%(V)s)[0];
              } //closes i


            } //closes "lots of channels" special case code
            else
"""

        codeSource += """
            {
              //General case code
              //std::cout << "general case code" << std::endl;
              long long Hpos = 0;
              long long Vpos = 0;
              for (int i = 0; i < batchSize; i++) {
                    long long Hposi = Hpos;
                    long long Vposi = Vpos;


                    for (int r = 0;  r < outputHeight; r++) {
                      long long Hposr = Hpos;
                      long long Vposr = Vpos;
                      for (int c = 0; c < outputWidth; c++) {
                       long long Hposc = Hpos;
                       long long Vposc = Vpos;
                       for (int t = 0; t < outputDur; t++) {
                            long long Hpost = Hpos;
                            long long Vpost = Vpos;
                            //of the loops so far, j should be the innermost, because
                            //each loop through j visits the same elements of V
                            //this implies that the last index of H should be the j index
                            //since V and H should have the same format, this means
                            //z should be the last index in v, and therefore the innermost
                            //of the next set of for loops

                            int Wpos = 0;
                            int bPos = 0;


                            for (int j = 0; j < outputChannels; j++) {


                                long long Hposj = Hpos;
                                long long Vposj = Vpos;
                                int Wposj = Wpos;

                                // H[i,r,c,t,j] = b[j]

                                dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);


                                writePos = ELEM_AT(%(b)s,bPos);


                                for (int k =0; k < filterHeight; k++) {
                                  int Wposk = Wpos;
                                  long long Vposk = Vpos;
                                  for (int l = 0; l < filterWidth; l++) {
                                    int Wposl = Wpos;
                                    long long Vposl = Vpos;
                                    for (int m = 0; m < filterDur; m++) {
                                      int Wposm = Wpos;
                                      long long Vposm = Vpos;
                                      for (int z = 0; z < inputChannels; z++) {
                                        //H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]


                                        writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);

                                        Wpos += ws4;
                                        Vpos += vs4;
                                      } // close z
                                      Wpos = Wposm + ws3;
                                      Vpos = Vposm + vs3;
                                    } // close m
                                    Wpos = Wposl + ws2;
                                    Vpos = Vposl + vs2;
                                  } //close l
                                  Wpos = Wposk + PyArray_STRIDES(%(W)s)[1];
                                  Vpos = Vposk + PyArray_STRIDES(%(V)s)[1];
                                } //close k


                              bPos += bs;
                              Wpos = Wposj + ws0;
                              Hpos = Hposj +  hs4;
                              Vpos = Vposj;
                              //std::cout << "incremented Wpos by " << ws0 << std::endl;
                              //std::cout << "incremented Hpos by " << hs4 << std::endl;
                             } //close j
                             Hpos = Hpost + PyArray_STRIDES(%(H)s)[3];
                             Vpos = Vpost + vs3 * dt;
                         } //close t
                         Hpos = Hposc + PyArray_STRIDES(%(H)s)[2];
                         Vpos = Vposc + vs2 * dc;
                       } //close c
                       Hpos = Hposr + PyArray_STRIDES(%(H)s)[1];
                       Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr;
                   } //closes r
                   Hpos = Hposi + PyArray_STRIDES(%(H)s)[0];
                   Vpos = Vposi + PyArray_STRIDES(%(V)s)[0];
              } //closes i
            } //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations
            ///////////// < /code generated by Conv3D >
        """

        return strutil.renderString(codeSource, locals())
Exemplo n.º 9
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        W, b, d, H, RShape = inputs
        fail = sub['fail']

        R = outputs[0]

        codeSource =  """
            ///////////// < code generated by GpuConvTransp3D >

            //printf("\t\t\t\tGpuConvTransp c code\\n");

            //Check dimensionality of inputs
            if (%(H)s->nd != 5)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
                %(fail)s
            }

            if (%(W)s->nd != 5)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
                %(fail)s
            }

            if (%(b)s->nd != 1)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
                %(fail)s
            }

            if (%(d)s->nd != 1)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
                %(fail)s
            }

            //Read and check stride arguments
            if (%(d)s->dimensions[0] != 3)
            {
                PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
                %(fail)s
            }
{ // for fail
            const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
            const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
            if (dr <= 0 || dc <= 0 || dt <= 0)
            {
                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                %(fail)s
            }


            //Read and check sizes of inputs

{ // for fail
            const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
            const int outputChannels =  CudaNdarray_HOST_DIMS(%(W)s)[0];

            if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
            {
                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
                %(fail)s
            }
{ // for fail

            const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];

            if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
            {
                PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
                %(fail)s
            }
{ // for fail

            const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
            const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
            const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
            const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
            const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
            const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];

            int videoHeight = (outputHeight-1) * dr + filterHeight;
            int videoWidth = (outputWidth-1) * dc + filterWidth;
            int videoDur = (outputDur-1) * dt + filterDur;


            if (%(RShape)s)
            {
                if (%(RShape)s->nd != 1)
                {
                    PyErr_Format(PyExc_ValueError, "RShape must be a vector");
                    %(fail)s
                }

                if (%(RShape)s->dimensions[0] != 3)
                {
                    PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                    %(fail)s
                }
{ // for fail

                                dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
                                dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
                                dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);

                if (RShape0 != -1)
                {
                    if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
                    {
                        PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape 1, RShape2 );
                        %(fail)s
                    }

                    videoHeight = RShape0;
                    videoWidth = RShape1;
                    videoDur = RShape2;
                }
            }

            //Allocate the reconstruction
            npy_intp dims[5];
            dims[0] = batchSize;
            dims[4] = inputChannels;
            dims[1] = videoHeight;
            dims[2] = videoWidth;
            dims[3] = videoDur;

                        if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
                    Py_XDECREF(%(R)s);
               %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
                if (!(%(R)s)) {
                    PyErr_Format(PyExc_MemoryError,"Could not allocate R");
                    %(fail)s;
                }
                        }
            cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);

{ // for fail

bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
bool work_complete = false;

const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];


if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
    //conv_transp_rows_stack
    dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
    dim3 threads(videoDur);

HERE

    int shared_size=0;
        conv_transp_rows_stack<<<grid, threads, shared_size>>>(
        CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
        videoHeight, videoWidth, videoDur,
        filterHeight, filterWidth, filterDur,
        outputHeight, outputWidth, outputDur,
        outputChannels, inputChannels,
        dr,dc,dt,
        hs3,hs2,hs1,hs4,hs0,
        ws3,ws2,ws1,ws4,ws0,
        CudaNdarray_HOST_STRIDES(%(b)s)[0]);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            work_complete = true;
        if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
        }
        else
        {
            if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
        }


}

if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
                         out_contiguous,b_strided,outputDur);
            %(fail)s
}



}}}}}} // for fail
            ///////////// < /code generated by GpuConvTransp3D >
        """
        return strutil.renderString(codeSource,locals())
Exemplo n.º 10
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        V, W, b, d = inputs
        fail = sub['fail']

        H = outputs[0]

        codeSource = """
                        ///////////// < code generated by GpuConv3D >

                        //printf("\t\t\t\tConv3DGPU c code\\n");

                        //Check dimensionality of inputs
                        if (%(W)s->nd != 5)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
                            %(fail)s
                        }

                        if (%(V)s->nd != 5)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
                            %(fail)s
                        }

                        if (%(b)s->nd != 1)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
                            %(fail)s
                        }

                        if (%(d)s->nd != 1)
                        {
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
                            %(fail)s

                        }
                        if (%(d)s->dimensions[0] != 3)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
                            %(fail)s

                        }

{ //extra scope so fail doesn't jump over declarations
                        //Read and check sizes of inputs
                        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
                        const int outputChannels =  CudaNdarray_HOST_DIMS(%(W)s)[0];
                        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
                        if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
                        {
                            PyErr_Format(PyExc_ValueError, "GpuConv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
                            %(fail)s
                        }
{  //extra scope so error handler jumps don't cause errors
                        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
                        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
                        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
                        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
                        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
                        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
            if (vidHeight < filterHeight)
            {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
                %(fail)s
            }
{ // extra scope so fail works
            if (vidWidth < filterWidth)
            {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
                %(fail)s
            }
{ // extra scope so fail works
            if (vidDur < filterDur)
            {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
                %(fail)s
            }
{ // extra scope so fail works

                        //Read and check stride arguments
                        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
                        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
                        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
                        if (dr <= 0 || dc <= 0 || dt <= 0)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
                %(fail)s
                        }
{ // extra scope so fail works

                        //Make correctly sized output
                        const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
                        const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
                        const int outputDur = int( (vidDur - filterDur) / dt ) +1;

                        npy_intp dims[5];
                        dims[0] = batchSize;
                        dims[4] = outputChannels;
                        dims[1] = outputHeight;
                        dims[2] = outputWidth;
                        dims[3] = outputDur;

                        if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
                                Py_XDECREF(%(H)s);
                                %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
                                if (!(%(H)s)) {

                    PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
                            %(fail)s
                                }
                        }
{ // extra scope so fail will not cross declarations
                        //#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################

                        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
                        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
                        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
                        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
                        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
                        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
                        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
                        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
                        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
                        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];

                        // Compute H
                        //H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]

bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
bool work_complete = false;

if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
    //conv_rows_stack
    dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
    dim3 threads(outputDur);

    int shared_size=0;
        conv_rows_stack<<<grid, threads, shared_size>>>(
        CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
        vidHeight, vidWidth, vidDur,
        filterHeight, filterWidth, filterDur,
        outputChannels, inputChannels,
        dr,dc,dt,
        vs3,vs2,vs1,vs4,vs0,
        ws3,ws2,ws1,ws4,ws0);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            work_complete = true;
            if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
        }
        else
        {
            if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
        }


}

if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
}

}}}}}}} //extra scope so error handler jumps don't cross declarations
                        ///////////// < /code generated by GpuConv3D >
        """
        return strutil.renderString(codeSource, locals())
Exemplo n.º 11
0
    def c_code(self, node, nodename, inputs, outputs, sub):
        V, W, b, d = inputs
        fail = sub['fail']

        H = outputs[0]

        codeSource =  """
                        ///////////// < code generated by GpuConv3D >

                        //printf("\t\t\t\tConv3DGPU c code\\n");

                        //Check dimensionality of inputs
                        if (%(W)s->nd != 5)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
                            %(fail)s
                        }

                        if (%(V)s->nd != 5)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
                            %(fail)s
                        }

                        if (%(b)s->nd != 1)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
                            %(fail)s
                        }

                        if (%(d)s->nd != 1)
                        {
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
                            %(fail)s

                        }
                        if (%(d)s->dimensions[0] != 3)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
                            %(fail)s

                        }

{ //extra scope so fail doesn't jump over declarations
                        //Read and check sizes of inputs
                        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
                        const int outputChannels =  CudaNdarray_HOST_DIMS(%(W)s)[0];
                        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
                        if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
                        {
                            PyErr_Format(PyExc_ValueError, "GpuConv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
                            %(fail)s
                        }
{  //extra scope so error handler jumps don't cause errors
                        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
                        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
                        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
                        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
                        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
                        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
            if (vidHeight < filterHeight)
            {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
                %(fail)s
            }
{ // extra scope so fail works
            if (vidWidth < filterWidth)
            {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
                %(fail)s
            }
{ // extra scope so fail works
            if (vidDur < filterDur)
            {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
                %(fail)s
            }
{ // extra scope so fail works

                        //Read and check stride arguments
                        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
                        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
                        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
                        if (dr <= 0 || dc <= 0 || dt <= 0)
                        {
                PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
                %(fail)s
                        }
{ // extra scope so fail works

                        //Make correctly sized output
                        const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
                        const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
                        const int outputDur = int( (vidDur - filterDur) / dt ) +1;

                        npy_intp dims[5];
                        dims[0] = batchSize;
                        dims[4] = outputChannels;
                        dims[1] = outputHeight;
                        dims[2] = outputWidth;
                        dims[3] = outputDur;

                        if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
                        CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
                                Py_XDECREF(%(H)s);
                                %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
                                if (!(%(H)s)) {

                    PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
                            %(fail)s
                                }
                        }
{ // extra scope so fail will not cross declarations
                        //#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################

                        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
                        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
                        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
                        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
                        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
                        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
                        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
                        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
                        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
                        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];

                        // Compute H
                        //H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]

bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
bool work_complete = false;

if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
    //conv_rows_stack
    dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
    dim3 threads(outputDur);

    int shared_size=0;
        conv_rows_stack<<<grid, threads, shared_size>>>(
        CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
        vidHeight, vidWidth, vidDur,
        filterHeight, filterWidth, filterDur,
        outputChannels, inputChannels,
        dr,dc,dt,
        vs3,vs2,vs1,vs4,vs0,
        ws3,ws2,ws1,ws4,ws0);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            work_complete = true;
            if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
        }
        else
        {
            if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
            if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
        }


}

if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
}

}}}}}}} //extra scope so error handler jumps don't cross declarations
                        ///////////// < /code generated by GpuConv3D >
        """
        return strutil.renderString(codeSource,locals())