def prob_max_pool_c01b(c01b, pool_shape, top_down=None):
    """
    Apply the ``ProbMaxPool`` op to an input whose shape unpacks as
    (channels, rows, cols, batch size).

    Parameters
    ----------
    c01b : tensor-like
        Input; ``c01b.shape`` must unpack into four values
        ``(ch, zr, zc, batch_size)``.
    pool_shape : pair of int
        Pooling window ``(r, c)``. Only square windows are supported.
    top_down : tensor-like, optional
        Second input of the op. When None, a zero tensor of shape
        ``(ch, zr // r, zc // c, batch_size)`` with the dtype of ``c01b``
        is used.

    Returns
    -------
    The value returned by ``ProbMaxPool(pool_shape[0])`` applied to the
    gpu-contiguous versions of ``c01b`` and ``top_down``.

    Raises
    ------
    UnimplementedError
        If ``pool_shape`` is not square.
    """
    if pool_shape[0] != pool_shape[1]:
        raise UnimplementedError("Non square pool shapes are not supported yet")
    assert pool_shape[0] > 0

    ch, zr, zc, batch_size = c01b.shape
    r, c = pool_shape
    if top_down is None:
        # Floor division keeps the pooled extents integral; plain `/` is
        # true division on Python 3 and would produce a non-integer shape.
        top_down = tensor.zeros((ch, zr // r, zc // c, batch_size),
                                dtype=c01b.dtype)

    op = ProbMaxPool(pool_shape[0])
    c01b = gpu_contiguous(c01b)
    top_down = gpu_contiguous(top_down)
    return op(c01b, top_down)
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls cuda-convnet's ``convImgActs``.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 2 str
        C variable names for ``hid_acts`` and ``filters``.
    outputs : sequence of 1 str
        C variable name for ``targets``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template with all ``%(...)s`` placeholders substituted from
        this function's local variables.

    Raises
    ------
    UnimplementedError
        If ``self.stride != 1`` or ``self.copy_non_contiguous`` is true.
    """
    hid_acts, filters = inputs
    targets, = outputs
    fail = sub['fail']

    # Fragments of the C template, concatenated in order at the end.
    pieces = []
    # Number of C scopes left open by the fragments; closed at the end.
    open_scopes = 0

    # scaleTargets / scaleOutput are defined for parity with the sibling
    # ops; the convImgActs call emitted below does not pass them.
    pieces.append("""
    #define scaleTargets 0
    #define scaleOutput 1
    """)

    if self.dense_connectivity:
        pieces.append("""
    #define numGroups 1
    """)

    pieces.append("""
    #define paddingStart (-%d)
    """ % self.pad)

    # Only unit stride is supported by this version of the op.
    if self.stride != 1:
        raise UnimplementedError()
    pieces.append("""
    #define moduleStride 1
    """)

    if self.copy_non_contiguous:
        raise UnimplementedError()
    pieces.append("#define IMAGEACTS_COPY_NON_CONTIGUOUS 0\n")

    # Wrap hid_acts in an NVMatrix (nv_hid_acts) for the cuda-convnet call.
    pieces.append(self._argument_contiguity_check("hid_acts"))
    pieces.append("""
    if (%(hid_acts)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "hid_acts must have nd=4, got nd=%%i", %(hid_acts)s->nd);
        %(fail)s;
    }

    { //setup_nv_hid_acts brace 1
    const int *hid_act_dims = CudaNdarray_HOST_DIMS(%(hid_acts)s);
    const int numFilters = hid_act_dims[0];
    const int hidActsSizeY = hid_act_dims[1];
    const int hidActsSizeX = hid_act_dims[2];
    //printf("hidActs shape: %%d %%d\\n", hidActsSizeY, hidActsSizeX);
    const int batch_size = hid_act_dims[3];
    NVMatrix nv_hid_acts(%(hid_acts)s, numFilters * hidActsSizeY
                                       * hidActsSizeX, batch_size,
                         "image_acts:nv_hid_acts");
    int img_channels = -1;
    """)
    open_scopes += 1

    # Wrap filters in an NVMatrix (nv_filters) for the cuda-convnet call.
    pieces.append(self._argument_contiguity_check("filters"))
    pieces.append("""
    if (%(filters)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "filters must have nd=4, got nd=%%i", %(filters)s->nd);
        %(fail)s;
    }

    { // setup_nv_filters brace 1
    const int * filters_dims = CudaNdarray_HOST_DIMS(%(filters)s);
    const int filter_channels = filters_dims[0];
    const int filter_rows = filters_dims[1];
    const int filter_cols = filters_dims[2];
    const int num_filters = filters_dims[3];

    if ((num_filters %% (numGroups * 16)) != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "Each group must have a multiple of 16 channels, but num_filters %%%% (numGroups * 16) = %%d %%%% ( %%d * 16) = %%d.",
            num_filters, numGroups, num_filters %% (numGroups * 16));
        %(fail)s;
    }

    if (filter_rows != filter_cols)
    {
        PyErr_Format(PyExc_ValueError,
            "filter must be square, but have shape (%%d, %%d).",
            filter_rows, filter_cols);
        %(fail)s;
    }

    { // setup_nv_filters brace 2

    NVMatrix nv_filters(%(filters)s, filter_channels * filter_rows
                                     * filter_cols, num_filters,
                        "img_acts:nv_filters");
    """)
    open_scopes += 2

    # C expressions for the output image size; substituted into the
    # template below through locals().
    target_rows = "hidActsSizeY + filter_rows - 1 + 2 * paddingStart"
    target_cols = "hidActsSizeX + filter_cols - 1 + 2 * paddingStart"

    pieces.append("""
    int target_dims [] = {
        filter_channels,
        %(target_rows)s,
        %(target_cols)s,
        batch_size };

    #define numModulesY hid_act_dims[1]
    #define numModulesX hid_act_dims[2]

    if (CudaNdarray_prep_output(& %(targets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_filters brace # 1
    const int imgSizeY = %(target_rows)s;
    const int imgSizeX = %(target_cols)s;

    NVMatrix nv_targets(%(targets)s, target_dims[0] * target_dims[1]
                                     * target_dims[2], target_dims[3],
                        "image_acts: nv_targets");
    """)
    open_scopes += 1

    # Original author's notes: numFilters is not passed explicitly, it is
    # taken from nv_filters.getNumCols(); the filter size is derived from
    # nv_filters.getNumRows() divided by numFilterColors.
    pieces.append("""
    convImgActs(nv_hid_acts, nv_filters, nv_targets,
                imgSizeY, imgSizeX, numModulesY,
                paddingStart, moduleStride, filter_channels, numGroups);
    """)

    # Close every scope opened above, then fill in the placeholders.
    pieces.append('}' * open_scopes)
    template = "".join(pieces)
    return template % locals()
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls cuda-convnet's ``_weightActs``.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 3 str
        C variable names for ``images``, ``hid_grads`` and ``output_shape``.
    outputs : sequence of 2 str
        C variable names for ``weights_grads`` and ``partialsum_storage``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template rendered with this function's local variables.

    Raises
    ------
    UnimplementedError
        If ``self.copy_non_contiguous`` is true.
    """
    # A partial_sum of 0 makes the generated code fall back to numModules.
    partial_sum = self.partial_sum if self.partial_sum is not None else 0
    images, hid_grads, output_shape = inputs
    weights_grads, partialsum_storage = outputs
    fail = sub['fail']
    pad = self.pad

    # scaleTargets / scaleOutput are defined here; the _weightActs calls
    # below pass the literal values 0 and 1 in the corresponding slots.
    basic_setup = """
    #define scaleTargets 0
    #define scaleOutput 1
    """

    if self.dense_connectivity:
        basic_setup += """
    #define numGroups 1
    """

    basic_setup += """
    #define paddingStart (-%(pad)d)
    const int *hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s);
    const int hidGradsSizeY = hid_grads_dims[1];
    const int hidGradsSizeX = hid_grads_dims[2];
    const int numModules = hidGradsSizeX * hidGradsSizeY;
    int partialSum = %(partial_sum)d > 0 ? %(partial_sum)d : numModules;

    // using this expression instead of numModules %% partialSum
    // because nvcc+msvc9 yield a strange behaviour when using %%
    if ( numModules - (numModules / partialSum) * partialSum != 0) {
        PyErr_Format(PyExc_ValueError,
            "partialSum must divide numModules, but partialSum=%%d and "
            "numModules=%%d", partialSum, numModules);
        %(fail)s;
    }
    """

    basic_setup += """
    #define moduleStride %d
    """ % self.stride

    if self.copy_non_contiguous:
        raise UnimplementedError()
    else:
        basic_setup += "#define WEIGHTACTS_COPY_NON_CONTIGUOUS 0\n"

    # The amount of braces that must be closed at the end
    num_braces = 0

    # Convert images into nv_images, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_images = self._argument_contiguity_check("images") + """
    if (%(images)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have nd=4, got nd=%%i", %(images)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1
    const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
    const int img_channels = images_dims[0];
    if (img_channels > 3 && img_channels %% 4 != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have 3 or fewer channels, or have a multiple of 4 channels, got %%i",
            img_channels);
        %(fail)s;
    }

    { //setup_nv_images brace 2
    const int * hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s);
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];
    NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX,
                       batch_size, "weight_acts: nv_images");
    """
    num_braces += 2

    # Convert hid_grads into nv_hid_grads, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_hid_grads = self._argument_contiguity_check("hid_grads") + """
    if (%(hid_grads)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "hid_grads must have nd=4, got nd=%%i", %(hid_grads)s->nd);
        %(fail)s;
    }

    { //setup_nv_hid_grads brace 1
    const int numFilters = hid_grads_dims[0];
    const int batch_size = hid_grads_dims[3];
    NVMatrix nv_hid_grads(%(hid_grads)s, numFilters * hidGradsSizeY
                                         * hidGradsSizeX, batch_size,
                          "weight_acts:nv_hid_grads");
    """
    num_braces += 1

    setup_nv_weights_grads = """
    int filters_dims[4];
    // filters:  (input channels, filter rows, filter cols, output channels)

    npy_intp *shape_dims = PyArray_DIMS(%(output_shape)s);
    npy_intp target_rows, target_cols;
    PyArrayObject *casted_shape;
    PyArray_Descr *intp_dtype;
    if (PyArray_NDIM(%(output_shape)s) != 1) {
        PyErr_Format(PyExc_ValueError,
            "output shape must be a vector, got %%d-tensor",
            PyArray_NDIM(%(output_shape)s));
        %(fail)s;
    }
    else if (shape_dims[0] != 2)
    {
        PyErr_Format(PyExc_ValueError,
            "output shape must be length 2, got %%d",
            (int)shape_dims[0]);
        %(fail)s;
    }
    else if ((PyArray_DESCR(%(output_shape)s))->kind != 'i' &&
             (PyArray_DESCR(%(output_shape)s))->kind != 'u')
    {
        PyErr_SetString(PyExc_TypeError,
            "output shape must have integer or uint dtype");
        %(fail)s;
    }
    intp_dtype = PyArray_DescrFromType(NPY_INTP);
    casted_shape = (PyArrayObject *)PyArray_CastToType(%(output_shape)s,
                                                       intp_dtype, 0);
    // The cast can fail (it then sets a Python error and returns NULL).
    if (casted_shape == NULL)
    {
        %(fail)s;
    }
    target_rows = *((npy_intp *)PyArray_GETPTR1(casted_shape, 0));
    target_cols = *((npy_intp *)PyArray_GETPTR1(casted_shape, 1));
    // casted_shape is a new reference that is not used past this point;
    // release it so it is not leaked on every call.
    Py_DECREF(casted_shape);
    filters_dims[0] = img_channels;
    filters_dims[1] = target_rows;
    filters_dims[2] = target_cols;
    if (filters_dims[1] != filters_dims[2])
    {
        PyErr_Format(PyExc_ValueError,
            "filter must be square, but have shape (%%d, %%d).",
            filters_dims[1], filters_dims[2]);
        %(fail)s;
    }
    else if (moduleStride > filters_dims[1]) {
        PyErr_Format(PyExc_ValueError,
            "stride %%d greater than filter size (%%d, %%d)",
            moduleStride, filters_dims[1], filters_dims[2]);
        %(fail)s;
    }
    filters_dims[3] = numFilters;
    const int filterSize = filters_dims[1];
    int partialsum_storage_dims[5];
    for (int i = 1; i < 5; i++)
    {
        partialsum_storage_dims[i] = filters_dims[i - 1];
    }
    partialsum_storage_dims[0] = numModules / partialSum;
    if (partialSum != numModules &&
        CudaNdarray_prep_output(&%(partialsum_storage)s, 5,
                                partialsum_storage_dims))
    {
        %(fail)s;
    }

    for (int i = 0; i < 4; i++)
    {
        if (filters_dims[i] <= 0)
        {
            printf("filters_dims[%%d] = %%d\\n", i, filters_dims[i]);
            assert(false);
        }
    }
    if (CudaNdarray_prep_output(& %(weights_grads)s, 4, filters_dims))
    {
        %(fail)s;
    }

    { // setup_nv_weights_grad brace # 1

    NVMatrix nv_weights_grads(%(weights_grads)s,
                              filters_dims[0] * filterSize * filterSize,
                              numFilters, "weight_acts:nv_weights_grads");
    """
    num_braces += 1

    # Original author's notes:
    # imgSizeX is not specified here, it is computed internally
    # (in _filterActsSparse) by the lines:
    #     int imgPixels = images.getNumRows() / numImgColors;
    #     int imgSizeX = imgPixels / imgSizeY;
    # numFilters is not specified here, it is determined by
    # nv_filters.getNumCols(); the size of the filters is determined by
    # dividing nv_filters.getNumRows() by numFilterColors.
    run_kernel = """
    if (partialSum == numModules)
        _weightActs(nv_images, nv_hid_grads, nv_weights_grads,
                    imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
                    paddingStart, moduleStride, img_channels, numGroups,
                    partialSum, 0, 1);
    else {
        NVMatrix nv_partialsum(%(partialsum_storage)s,
                               (numModules / partialSum) *
                               filters_dims[0] * filterSize * filterSize,
                               numFilters, "weight_acts: nv_partialsum");
        _weightActs(nv_images, nv_hid_grads, nv_partialsum,
                    imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
                    paddingStart, moduleStride, img_channels, numGroups,
                    partialSum, 0, 1);
        nv_partialsum.reshape((numModules / partialSum),
                              filters_dims[0] * filterSize * filterSize * numFilters);

        // sum out axis 0 of nv_partialsum
        #define AXIS 0
        // scale the contents of nv_weights_grads by 0
        // i.e., clear out its pre-existing content
        #define SCALE_THIS 0
        // scale the new sum by 1, i.e., don't do any scaling
        #define SCALE_SUM 1
        nv_weights_grads.addSum(nv_partialsum, AXIS, SCALE_THIS, SCALE_SUM);
    }
    """

    braces = '}' * num_braces

    rval = (basic_setup +
            setup_nv_images +
            setup_nv_hid_grads +
            setup_nv_weights_grads +
            run_kernel +
            braces)

    # Placeholders are filled from the local variable names above.
    rval = render_string(rval, locals())

    return rval
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls ``localProbMaxUndo``.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 6 str
        C variable names for ``p``, ``h``, ``gp``, ``gh``, ``gp_iszero``
        and ``gh_iszero``.
    outputs : sequence of 2 str
        C variable names for ``targets_z`` and ``targets_t``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template with placeholders substituted from this function's
        local variables (including ``start``, ``stride`` and ``ds`` read
        from ``self``).

    Raises
    ------
    UnimplementedError
        If ``self.copy_non_contiguous`` is true.
    """
    p, h, gp, gh, gp_iszero, gh_iszero = inputs
    targets_z, targets_t, = outputs
    fail = sub['fail']

    # The amount of braces that must be closed at the end
    num_braces = 0

    if self.copy_non_contiguous:
        raise UnimplementedError()
    else:
        basic_setup = "#define PROBMAXPOOLGRAD_COPY_NON_CONTIGUOUS 0\n"

    # Convert h into nv_h, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_h = self._argument_contiguity_check("h") + """
    if (%(h)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "h must have nd=4, got nd=%%i", %(h)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    const int * images_dims = CudaNdarray_HOST_DIMS(%(h)s);
    const int img_channels = images_dims[0];
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];

    if(imgSizeY != imgSizeX){
        PyErr_Format(PyExc_ValueError,
            "images must be square(dims[1] == dims[2]). Shape (%%i,%%i,%%i,%%i)",
            img_channels, imgSizeY, imgSizeX, batch_size);
        %(fail)s;
    }
    if(%(ds)s > imgSizeY){
        PyErr_Format(PyExc_ValueError,
            "ds(%%d) must be <= imgSizeX(%%d) and imgSizeY(%%d).",
            %(ds)s, imgSizeX, imgSizeY);
        %(fail)s;
    }
    if (CudaNdarray_HOST_DIMS(%(h)s)[0] %% 16 != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "h must have a number of channels that is a multiple of 16. Got %%d",
            CudaNdarray_HOST_DIMS(%(h)s)[0]);
        %(fail)s;
    }

    NVMatrix nv_h(%(h)s, img_channels * imgSizeY * imgSizeX,
                  batch_size, "ProbMaxPool:nv_h");
    """
    num_braces += 1

    # Convert p into nv_p
    setup_nv_p = self._argument_contiguity_check("p") + """
    if (%(p)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "P must have nd=4, got nd=%%i", %(p)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    int _outputsX = ((int)(ceil((imgSizeY - %(start)s - %(ds)s) / ((float)%(stride)s)))) + 1;

    NVMatrix nv_p(%(p)s, img_channels * _outputsX * _outputsX,
                  batch_size, "ProbMaxPool:nv_p");
    """
    num_braces += 1

    # Convert gh in nv_gh
    setup_nv_gh = self._argument_contiguity_check("gh") + """
    if (%(gh)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "gh must have nd=4, got nd=%%i", %(gh)s->nd);
        %(fail)s;
    }
    if (CudaNdarray_HOST_DIMS(%(gh)s)[0] %% 16 != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "gh must have a number of channels that is a multiple of 16. Got %%d",
            CudaNdarray_HOST_DIMS(%(gh)s)[0]);
        %(fail)s;
    }

    { //setup_nv_gh brace 1

    const int * gh_dims = CudaNdarray_HOST_DIMS(%(gh)s);
    const int gh_channels = gh_dims[0];
    const int ghSizeY = gh_dims[1];
    const int ghSizeX = gh_dims[2];

    NVMatrix nv_gh(%(gh)s, gh_channels * ghSizeY * ghSizeX,
                   batch_size, "ProbMaxPool:nv_gh");
    """
    num_braces += 1

    # Convert gp in nv_gp
    setup_nv_gp = self._argument_contiguity_check("gp") + """
    if (%(gp)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "gp must have nd=4, got nd=%%i", %(gp)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    int _outputsX = ((int)(ceil((imgSizeY - %(start)s - %(ds)s) / ((float)%(stride)s)))) + 1;

    NVMatrix nv_gp(%(gp)s, img_channels * _outputsX * _outputsX,
                   batch_size, "ProbMaxPool:nv_gp");
    """
    num_braces += 1

    setup_nv_targets_z = """
    int target_z_dims [] = {
        img_channels,
        imgSizeX,
        imgSizeY,
        batch_size };

    if (CudaNdarray_prep_output(& %(targets_z)s, 4, target_z_dims))
    {
        %(fail)s;
    }

    { // setup_nv_target brace # 1

    NVMatrix nv_targets_z(%(targets_z)s,
                          target_z_dims[0] * target_z_dims[1] * target_z_dims[2],
                          target_z_dims[3], "ProbMaxPool:nv_targets_z");
    """
    num_braces += 1

    setup_nv_targets_t = """
    int target_t_dims [] = {
        img_channels,
        _outputsX,
        _outputsX,
        batch_size };

    if (CudaNdarray_prep_output(& %(targets_t)s, 4, target_t_dims))
    {
        %(fail)s;
    }

    { // setup_nv_target brace # 1

    NVMatrix nv_targets_t(%(targets_t)s,
                          target_t_dims[0] * target_t_dims[1] * target_t_dims[2],
                          target_t_dims[3], "ProbMaxPool:nv_targets_t");

    float * gp_iszero = CudaNdarray_DEV_DATA(%(gp_iszero)s);
    float * gh_iszero = CudaNdarray_DEV_DATA(%(gh_iszero)s);
    """
    num_braces += 1

    undo_pool = """
    localProbMaxUndo(nv_h, nv_p, nv_gh, nv_gp, nv_targets_z, nv_targets_t,
                     %(ds)s, %(start)s, %(stride)s, _outputsX, imgSizeX,
                     gp_iszero, gh_iszero);
    """

    braces = '}' * num_braces

    rval = (basic_setup +
            setup_nv_h +
            setup_nv_p +
            setup_nv_gh +
            setup_nv_gp +
            setup_nv_targets_z +
            setup_nv_targets_t +
            undo_pool +
            braces)
    # These must be bound as locals before the substitution below, since
    # the template refers to them as %(start)s, %(stride)s and %(ds)s.
    start = self.start
    stride = self.stride
    ds = self.ds
    rval = rval % locals()

    return rval
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls ``probabilisticPool``.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 2 str
        C variable names for ``images`` and ``top_down``.
    outputs : sequence of 2 str
        C variable names for ``ptargets`` and ``htargets``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template with placeholders substituted from this function's
        local variables (including ``start``, ``stride`` and ``ds`` read
        from ``self``).

    Raises
    ------
    UnimplementedError
        If ``self.copy_non_contiguous`` is true.
    """
    images, top_down = inputs
    ptargets, htargets = outputs
    fail = sub['fail']

    # The amount of braces that must be closed at the end
    num_braces = 0

    if self.copy_non_contiguous:
        raise UnimplementedError()
    else:
        basic_setup = "#define PROBMAXPOOL_COPY_NON_CONTIGUOUS 0\n"

    # Convert images in nv_images, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_images = self._argument_contiguity_check("images") + """
    if (%(images)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have nd=4, got nd=%%i", %(images)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
    const int img_channels = images_dims[0];
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];

    if(imgSizeY != imgSizeX){
        PyErr_Format(PyExc_ValueError,
            "images must be square(dims[1] == dims[2]). Shape (%%i,%%i,%%i,%%i)",
            img_channels, imgSizeY, imgSizeX, batch_size);
        %(fail)s;
    }
    if(%(ds)s > imgSizeY){
        PyErr_Format(PyExc_ValueError,
            "ds(%%d) must be <= imgSizeX(%%d) and imgSizeY(%%d).",
            %(ds)s, imgSizeX, imgSizeY);
        %(fail)s;
    }
    if(%(start)s >= imgSizeX){
        PyErr_Format(PyExc_ValueError,
            "start is %%d but must be smaller then the images size of %%d x %%d.",
            %(start)s, imgSizeX, imgSizeY);
        %(fail)s;
    }

    NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX,
                       batch_size, "ProbMaxPool:nv_images");
    """
    num_braces += 1

    # TODO check if stride != pool shape works, if not put error check
    setup_nv_top_down = self._argument_contiguity_check("top_down") + """
    if (%(top_down)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "top_down must have nd=4, got nd=%%i", %(top_down)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    int _outputsX = ((int)(ceil((imgSizeY - %(start)s - %(ds)s) / ((float)%(stride)s)))) + 1;

    NVMatrix nv_top_down(%(top_down)s, img_channels * _outputsX * _outputsX,
                         batch_size, "ProbMaxPool:nv_top_down");
    """
    num_braces += 1

    setup_nv_ptargets = """
    //int _outputsX = ((int)(ceil((imgSizeY - %(start)s - %(ds)s) / ((float)%(stride)s)))) + 1;

    int target_dims [] = {
        img_channels,
        _outputsX,
        _outputsX,
        batch_size };

    if (CudaNdarray_prep_output(& %(ptargets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_target brace # 1

    NVMatrix nv_ptargets(%(ptargets)s,
                         target_dims[0] * target_dims[1] * target_dims[2],
                         target_dims[3], "ProbMaxPool:nv_ptargets");
    """
    num_braces += 1

    # target_dims is declared again here; it lives in the nested scope
    # opened by the previous fragment, so it shadows the outer array.
    setup_nv_htargets = """
    int target_dims [] = {
        img_channels,
        imgSizeX,
        imgSizeY,
        batch_size };

    if (CudaNdarray_prep_output(& %(htargets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_target brace # 1

    NVMatrix nv_htargets(%(htargets)s,
                         target_dims[0] * target_dims[1] * target_dims[2],
                         target_dims[3], "ProbMaxPool:nv_htargets");
    """
    num_braces += 1

    do_pool = """
    probabilisticPool(nv_images, nv_top_down, nv_ptargets, nv_htargets,
                      img_channels, %(ds)s, %(start)s, %(stride)s,
                      _outputsX, MaxPooler());
    """

    braces = '}' * num_braces

    rval = (basic_setup +
            setup_nv_images +
            setup_nv_top_down +
            setup_nv_ptargets +
            setup_nv_htargets +
            do_pool +
            braces)
    # These must be bound as locals before the substitution below, since
    # the template refers to them as %(start)s, %(stride)s and %(ds)s.
    start = self.start
    stride = self.stride
    ds = self.ds
    rval = rval % locals()

    return rval
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls cuda-convnet's ``convFilterActs``.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 2 str
        C variable names for ``images`` and ``filters``.
    outputs : sequence of 1 str
        C variable name for ``targets``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template with all ``%(...)s`` placeholders substituted from
        this function's local variables.

    Raises
    ------
    UnimplementedError
        If ``self.copy_non_contiguous`` is true.
    """
    images, filters = inputs
    targets, = outputs
    fail = sub['fail']

    # Fragments of the C template, concatenated in order at the end.
    pieces = []
    # Number of C scopes left open by the fragments; closed at the end.
    open_scopes = 0

    # The emitted convFilterActs call takes scaleTargets and scaleOutput;
    # they are fixed to 0 and 1 here.
    pieces.append("""
    #define scaleTargets 0
    #define scaleOutput 1
    """)

    if self.dense_connectivity:
        pieces.append("""
    #define numGroups 1
    """)

    assert isinstance(self.pad, py_integer_types)
    assert self.pad >= 0, "pad must be non-negative"
    pieces.append("""
    #define paddingStart (-%d)
    """ % self.pad)

    pieces.append("""
    #define moduleStride %d
    """ % int(self.stride))

    if self.copy_non_contiguous:
        raise UnimplementedError()
    pieces.append("#define FILTERACTS_COPY_NON_CONTIGUOUS 0\n")

    # Wrap images in an NVMatrix (nv_images) for the cuda-convnet call.
    pieces.append(self._argument_contiguity_check("images"))
    pieces.append("""
    if (%(images)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have nd=4, got nd=%%i", %(images)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
    const int img_channels = images_dims[0];
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];
    NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX,
                       batch_size, "filter_acts:nv_images");
    """)
    open_scopes += 1

    # Wrap filters in an NVMatrix (nv_filters) for the cuda-convnet call.
    pieces.append(self._argument_contiguity_check("filters"))
    pieces.append("""
    if (%(filters)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "filters must have nd=4, got nd=%%i", %(filters)s->nd);
        %(fail)s;
    }

    { // setup_nv_filters brace 1
    const int * filters_dims = CudaNdarray_HOST_DIMS(%(filters)s);
    const int filter_channels = filters_dims[0];
    const int filter_rows = filters_dims[1];
    const int filter_cols = filters_dims[2];
    const int num_filters = filters_dims[3];

    if (numGroups * filter_channels != img_channels)
    {
        PyErr_Format(PyExc_ValueError,
            "# input channels mismatch. images have %%d but filters have %%d groups of %%d for a total of %%d.",
            img_channels, numGroups, filter_channels, numGroups * filter_channels);
        %(fail)s;
    }

    if ((num_filters %% (numGroups * 16)) != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "Each group must have a multiple of 16 channels, but num_filters %%%% (numGroups * 16) = %%d %%%% ( %%d * 16) = %%d.",
            num_filters, numGroups, num_filters %% (numGroups * 16));
        %(fail)s;
    }

    if (filter_rows != filter_cols)
    {
        PyErr_Format(PyExc_ValueError,
            "filter must be square, but instead have shape (%%d, %%d)",
            filter_rows, filter_cols);
        %(fail)s;
    }
    else if (moduleStride > filter_rows) {
        PyErr_Format(PyExc_ValueError,
            "stride %%d greater than filter size (%%d, %%d)",
            moduleStride, filter_rows, filter_cols);
        %(fail)s;
    }

    { // setup_nv_filters brace 2

    NVMatrix nv_filters(%(filters)s, filter_channels * filter_rows
                                     * filter_cols, num_filters,
                        "filter_acts:nv_filters");
    """)
    open_scopes += 2

    def _num_modules_expr(img_size, filter_size):
        # C expression for the number of filter positions along one axis:
        # quotient of the padded span by the stride, plus one extra
        # position when the span is not a multiple of the stride, plus 1.
        #   p + (m_x - 1) * s + f >= i_x
        #   p + (m_x - 1) * s >= i_x - f
        #   m_x = (i_x - f - p) / s + 1
        span = "(%s - 2*paddingStart - %s)" % (img_size, filter_size)
        quotient = "(%s / moduleStride)" % span
        remainder = "(%s %% moduleStride)" % span
        return "%s + ((%s > 0) ? 1 : 0) + 1" % (quotient, remainder)

    # Substituted into the template below through locals().
    target_rows = _num_modules_expr("imgSizeY", "filter_rows")
    target_cols = _num_modules_expr("imgSizeX", "filter_cols")

    pieces.append("""
    int target_dims [] = {
        num_filters,
        %(target_rows)s,
        %(target_cols)s,
        batch_size };

    #define numModulesY target_dims[1]
    #define numModulesX target_dims[2]

    if (CudaNdarray_prep_output(& %(targets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_filters brace # 1

    NVMatrix nv_targets(%(targets)s, target_dims[0] * target_dims[1]
                                     * target_dims[2], target_dims[3],
                        "filter_acts:nv_targets");
    """)
    open_scopes += 1

    # Original author's notes: imgSizeX is not passed, it is computed
    # internally (in _filterActsSparse) as
    #     int imgPixels = images.getNumRows() / numImgColors;
    #     int imgSizeX = imgPixels / imgSizeY;
    # numFilters is taken from nv_filters.getNumCols(), and the filter
    # size from nv_filters.getNumRows() divided by numFilterColors.
    pieces.append("""
    convFilterActs(nv_images, nv_filters, nv_targets,
                   imgSizeY, numModulesY, numModulesX,
                   paddingStart, moduleStride, img_channels,
                   numGroups, scaleTargets, scaleOutput);
    """)

    # Close every scope opened above, then fill in the placeholders.
    pieces.append('}' * open_scopes)
    template = "".join(pieces)
    return template % locals()
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls cuda-convnet's ``convLocalMaxUndo``.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 3 str
        C variable names for ``images``, ``maxout`` and ``gz``.
    outputs : sequence of 1 str
        C variable name for ``targets``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template with placeholders substituted from this function's
        local variables (including ``start``, ``stride`` and ``ds`` read
        from ``self``).

    Raises
    ------
    UnimplementedError
        If ``self.copy_non_contiguous`` is true.
    """
    images, maxout, gz = inputs
    targets, = outputs
    fail = sub['fail']

    # The amount of braces that must be closed at the end
    num_braces = 0

    if self.copy_non_contiguous:
        raise UnimplementedError()
    else:
        basic_setup = "#define MAXPOOLGRAD_COPY_NON_CONTIGUOUS 0\n"

    # Convert images in nv_images, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_images = self._argument_contiguity_check("images") + """
    if (%(images)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have nd=4, got nd=%%i", %(images)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
    const int img_channels = images_dims[0];
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];

    if(imgSizeY != imgSizeX){
        PyErr_Format(PyExc_ValueError,
            "images must be square(dims[1] == dims[2]). Shape (%%i,%%i,%%i,%%i)",
            img_channels, imgSizeY, imgSizeX, batch_size);
        %(fail)s;
    }
    if(%(ds)s > imgSizeY){
        PyErr_Format(PyExc_ValueError,
            "ds(%%d) must be <= imgSizeX(%%d) and imgSizeY(%%d).",
            %(ds)s, imgSizeX, imgSizeY);
        %(fail)s;
    }

    NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX,
                       batch_size, "MaxPool:nv_images");
    """
    num_braces += 1

    # Convert maxout in nv_maxout
    setup_nv_maxout = self._argument_contiguity_check("maxout") + """
    if (%(maxout)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "maxout must have nd=4, got nd=%%i", %(maxout)s->nd);
        %(fail)s;
    }

    { //setup_nv_maxout brace 1

    const int * maxout_dims = CudaNdarray_HOST_DIMS(%(maxout)s);
    const int maxout_channels = maxout_dims[0];
    const int maxoutSizeY = maxout_dims[1];
    const int maxoutSizeX = maxout_dims[2];

    if(maxoutSizeY != maxoutSizeX){
        PyErr_Format(PyExc_ValueError,
            "maxout must be square(dims[1] == dims[2])."
            " Shape (%%i,%%i,%%i,%%i)",
            maxout_channels, maxoutSizeY, maxoutSizeX, batch_size);
        %(fail)s;
    }
    if(img_channels != maxout_channels){
        PyErr_Format(PyExc_ValueError,
            "img_channels(%%d) should be equal to maxout_channels(%%d).",
            img_channels, maxout_channels);
        %(fail)s;
    }
    if(maxout_dims[3] != batch_size){
        PyErr_Format(PyExc_ValueError,
            "batch_size(%%d) should be equal to maxout_dims[3](%%d)",
            batch_size, maxout_dims[3]);
        %(fail)s;
    }

    NVMatrix nv_maxout(%(maxout)s, img_channels * maxoutSizeY * maxoutSizeX,
                       batch_size, "MaxPool:nv_maxout");
    """
    num_braces += 1

    # Convert gz in nv_gz.
    # In the shape-mismatch message the gz dims are passed first and the
    # maxout dims second, matching the order the format string names them.
    setup_nv_gz = self._argument_contiguity_check("gz") + """
    if (%(gz)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "gz must have nd=4, got nd=%%i", %(gz)s->nd);
        %(fail)s;
    }
    if (CudaNdarray_HOST_DIMS(%(gz)s)[0] %% 16 != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "gz must have a number of channels that is a multiple of 16. Got %%d",
            CudaNdarray_HOST_DIMS(%(gz)s)[0]);
        %(fail)s;
    }

    { //setup_nv_gz brace 1

    const int * gz_dims = CudaNdarray_HOST_DIMS(%(gz)s);
    const int gz_channels = gz_dims[0];
    const int gzSizeY = gz_dims[1];
    const int gzSizeX = gz_dims[2];

    if(maxout_dims[0] != gz_dims[0] ||
       maxout_dims[1] != gz_dims[1] ||
       maxout_dims[2] != gz_dims[2] ||
       maxout_dims[3] != gz_dims[3]){
        PyErr_Format(PyExc_ValueError,
            "gz shape(%%d, %%d, %%d, %%d) must be the same"
            " as maxout(%%d, %%d, %%d, %%d)",
            gz_dims[0], gz_dims[1], gz_dims[2], gz_dims[3],
            maxout_dims[0], maxout_dims[1], maxout_dims[2], maxout_dims[3]);
        %(fail)s;
    }

    NVMatrix nv_gz(%(gz)s, img_channels * maxoutSizeY * maxoutSizeX,
                   batch_size, "MaxPool:nv_gz");
    """
    num_braces += 1

    setup_nv_targets = """
    //int _outputsX = int(ceil((dic['imgSize'] - dic['start'] - dic['sizeX']) / float(dic['stride']))) + 1;
    int _outputsX = ((int)(ceil((imgSizeY - %(start)s - %(ds)s) / ((float)%(stride)s)))) + 1;

    int target_dims [] = {
        img_channels,
        imgSizeX,
        imgSizeY,
        batch_size };

    if (CudaNdarray_prep_output(& %(targets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_target brace # 1

    NVMatrix nv_targets(%(targets)s,
                        target_dims[0] * target_dims[1] * target_dims[2],
                        target_dims[3], "MaxPool:nv_targets");
    """
    num_braces += 1

    undo_pool = """
    convLocalMaxUndo(nv_images, nv_gz, nv_maxout, nv_targets,
                     %(ds)s, %(start)s, %(stride)s, _outputsX, 0, 1);
    """

    braces = '}' * num_braces

    rval = (basic_setup +
            setup_nv_images +
            setup_nv_maxout +
            setup_nv_gz +
            setup_nv_targets +
            undo_pool +
            braces)
    # These must be bound as locals before the substitution below, since
    # the template refers to them as %(start)s, %(stride)s and %(ds)s.
    start = self.start
    stride = self.stride
    ds = self.ds
    rval = rval % locals()

    return rval
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that calls cuda-convnet's ``convImgActs`` with
    a caller-supplied output image shape.

    Parameters
    ----------
    node, name
        Part of the ``c_code`` signature; not used by this method.
    inputs : sequence of 3 str
        C variable names for ``hid_acts``, ``filters`` and ``output_shape``.
    outputs : sequence of 1 str
        C variable name for ``targets``.
    sub : dict
        Must contain ``'fail'``, the C snippet emitted on every error path.

    Returns
    -------
    str
        The C template with all ``%(...)s`` placeholders substituted from
        this function's local variables.

    Raises
    ------
    UnimplementedError
        If ``self.copy_non_contiguous`` is true.
    """
    hid_acts, filters, output_shape = inputs
    targets, = outputs
    fail = sub['fail']

    # scaleTargets / scaleOutput are defined for parity with the sibling
    # ops; the convImgActs call emitted below does not pass them.
    basic_setup = """
    #define scaleTargets 0
    #define scaleOutput 1
    """

    if self.dense_connectivity:
        basic_setup += """
    #define numGroups 1
    """

    basic_setup += """
    #define paddingStart (-%d)
    """ % self.pad

    basic_setup += """
    #define moduleStride %d
    """ % self.stride

    if self.copy_non_contiguous:
        raise UnimplementedError()
    else:
        basic_setup += "#define IMAGEACTS_COPY_NON_CONTIGUOUS 0\n"

    # The amount of braces that must be closed at the end
    num_braces = 0

    # Convert hid_acts into nv_hid_acts, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_hid_acts = self._argument_contiguity_check("hid_acts") + """
    if (%(hid_acts)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "hid_acts must have nd=4, got nd=%%i", %(hid_acts)s->nd);
        %(fail)s;
    }

    { //setup_nv_hid_acts brace 1
    const int *hid_act_dims = CudaNdarray_HOST_DIMS(%(hid_acts)s);
    const int numFilters = hid_act_dims[0];
    const int hidActsSizeY = hid_act_dims[1];
    const int hidActsSizeX = hid_act_dims[2];
    //printf("hidActs shape: %%d %%d\\n", hidActsSizeY, hidActsSizeX);
    const int batch_size = hid_act_dims[3];
    NVMatrix nv_hid_acts(%(hid_acts)s, numFilters * hidActsSizeY
                                       * hidActsSizeX, batch_size,
                         "image_acts:nv_hid_acts");
    int img_channels = -1;
    """
    num_braces += 1

    # Convert filters into nv_filters, an NVMatrix, for compatibility
    # with the cuda-convnet functions
    setup_nv_filters = self._argument_contiguity_check("filters") + """
    if (%(filters)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "filters must have nd=4, got nd=%%i", %(filters)s->nd);
        %(fail)s;
    }

    { // setup_nv_filters brace 1
    const int * filters_dims = CudaNdarray_HOST_DIMS(%(filters)s);
    const int filter_channels = filters_dims[0];
    const int filter_rows = filters_dims[1];
    const int filter_cols = filters_dims[2];
    const int num_filters = filters_dims[3];

    if ((num_filters %% (numGroups * 16)) != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "Each group must have a multiple of 16 channels, but num_filters %%%% (numGroups * 16) = %%d %%%% ( %%d * 16) = %%d.",
            num_filters, numGroups, num_filters %% (numGroups * 16));
        %(fail)s;
    }

    if (filter_rows != filter_cols)
    {
        PyErr_Format(PyExc_ValueError,
            "filter must be square, but have shape (%%d, %%d).",
            filter_rows, filter_cols);
        %(fail)s;
    }
    else if (moduleStride > filter_rows) {
        PyErr_Format(PyExc_ValueError,
            "stride %%d greater than filter size (%%d, %%d)",
            moduleStride, filter_rows, filter_cols);
        %(fail)s;
    }

    { // setup_nv_filters brace 2

    NVMatrix nv_filters(%(filters)s, filter_channels * filter_rows
                                     * filter_cols, num_filters,
                        "img_acts:nv_filters");
    """
    num_braces += 2

    # The target image size is read from output_shape at run time and
    # checked against the largest size the hidden activations can cover.
    setup_nv_targets = """
    #define numModulesY hid_act_dims[1]
    #define numModulesX hid_act_dims[2]

    npy_intp *shape_dims = PyArray_DIMS(%(output_shape)s);
    npy_intp target_rows, target_cols;
    PyArrayObject *casted_shape;
    PyArray_Descr *intp_dtype;
    if (PyArray_NDIM(%(output_shape)s) != 1) {
        PyErr_Format(PyExc_ValueError,
            "output shape must be a vector, got %%d-tensor",
            PyArray_NDIM(%(output_shape)s));
        %(fail)s;
    }
    else if (shape_dims[0] != 2)
    {
        PyErr_Format(PyExc_ValueError,
            "output shape must be length 2, got %%d",
            (int)shape_dims[0]);
        %(fail)s;
    }
    else if ((PyArray_DESCR(%(output_shape)s))->kind != 'i' &&
             (PyArray_DESCR(%(output_shape)s))->kind != 'u')
    {
        PyErr_SetString(PyExc_TypeError,
            "output shape must have integer or uint dtype");
        %(fail)s;
    }
    intp_dtype = PyArray_DescrFromType(NPY_INTP);
    casted_shape = (PyArrayObject *)PyArray_CastToType(%(output_shape)s,
                                                       intp_dtype, 0);
    // The cast can fail (it then sets a Python error and returns NULL).
    if (casted_shape == NULL)
    {
        %(fail)s;
    }
    target_rows = *((npy_intp *)PyArray_GETPTR1(casted_shape, 0));
    target_cols = *((npy_intp *)PyArray_GETPTR1(casted_shape, 1));
    // casted_shape is a new reference that is not used past this point;
    // release it so it is not leaked on every call.
    Py_DECREF(casted_shape);
    {
    int target_dims [] = {
        filter_channels,
        target_rows,
        target_cols,
        batch_size };

    #define filterSize filter_rows
    #define MAX_ROWS (paddingStart + (numModulesY-1) * moduleStride + filterSize)
    if ((target_rows > MAX_ROWS)
        || (paddingStart + (numModulesX-1) * moduleStride + filterSize < target_cols))
    {
        PyErr_Format(PyExc_ValueError,
            "pylearn2.sandbox.cuda_convnet.image_acts.ImageActs: incompatible target image size (%%d, %%d), maximum (%%d, %%d)",
            (int)target_rows, (int)target_cols,
            (int)MAX_ROWS,
            (int)(paddingStart + (numModulesX-1) * moduleStride + filterSize));
        %(fail)s;
    }
    if (CudaNdarray_prep_output(& %(targets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_filters brace # 1
    const int imgSizeY = (int)target_rows;
    const int imgSizeX = (int)target_cols;

    NVMatrix nv_targets(%(targets)s, target_dims[0] * target_dims[1]
                                     * target_dims[2], target_dims[3],
                        "image_acts: nv_targets");
    """
    num_braces += 2

    # Original author's notes: numFilters is not passed explicitly, it is
    # taken from nv_filters.getNumCols(); the filter size is derived from
    # nv_filters.getNumRows() divided by numFilterColors.
    do_convolution = """
    convImgActs(nv_hid_acts, nv_filters, nv_targets,
                imgSizeY, imgSizeX, numModulesY,
                paddingStart, moduleStride, filter_channels, numGroups);
    """

    braces = '}' * num_braces

    rval = (basic_setup +
            setup_nv_hid_acts +
            setup_nv_filters +
            setup_nv_targets +
            do_convolution +
            braces)

    # Placeholders are filled from the local variable names above.
    rval = rval % locals()

    return rval
def c_code(self, node, name, inputs, outputs, sub):
    """
    Generate the C source that computes the weight gradients by calling
    the cuda-convnet `_weightActs` kernel.

    Parameters
    ----------
    node : unused here
    name : unused here
    inputs : sequence of two C variable names
        `(images, hid_grads)`; both are checked to be 4-dimensional
        CudaNdarrays by the generated code.
    outputs : sequence of one C variable name
        `(weights_grads,)`; allocated by the generated code with shape
        (input channels, filter rows, filter cols, output channels).
    sub : dict
        Must provide `sub['fail']`, the C statement used to abort after a
        Python exception has been set.

    Returns
    -------
    str
        The rendered C code.

    Raises
    ------
    UnimplementedError
        If `self.stride != 1` or `self.copy_non_contiguous` is set.
    """
    # Unsupported configurations are rejected before any code is built.
    if self.stride != 1:
        raise UnimplementedError()
    if self.copy_non_contiguous:
        raise UnimplementedError()

    # NOTE: the templates below are rendered against locals(), so the
    # names partial_sum, images, hid_grads, weights_grads, fail and pad
    # must keep exactly these spellings.
    partial_sum = self.partial_sum if self.partial_sum is not None else 0
    images, hid_grads = inputs
    weights_grads, = outputs
    fail = sub['fail']
    pad = self.pad

    # scaleTargets / scaleOutput follow the cuda-convnet convention
    # "targets = scaleTargets * targets + scaleOutput * result". This op
    # only wants the plain result, hence 0 and 1. The `_weightActs` calls
    # below pass these two values as literals; the macros are kept so any
    # code sharing this translation unit still sees them.
    basic_setup = """
    #define scaleTargets 0
    #define scaleOutput 1
    """

    # NOTE(review): numGroups is only defined for dense connectivity, yet
    # the kernel call always references it -- confirm that the sparse case
    # is rejected elsewhere.
    if self.dense_connectivity:
        basic_setup += """
        #define numGroups 1
        """

    # The number of braces that must be closed at the end.
    num_braces = 0

    # The rank of hid_grads is validated *before* its dimensions are read.
    # The declarations that follow live in their own brace so that the
    # failure jump above them does not cross an initialisation.
    basic_setup += """
    #define paddingStart (-%(pad)d)
    #define moduleStride 1
    #define WEIGHTACTS_COPY_NON_CONTIGUOUS 0

    if (%(hid_grads)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "hid_grads must have nd=4, got nd=%%i", %(hid_grads)s->nd);
        %(fail)s;
    }

    { // basic_setup brace 1
    const int *hid_grads_dims = CudaNdarray_HOST_DIMS(%(hid_grads)s);
    const int hidGradsSizeY = hid_grads_dims[1];
    const int hidGradsSizeX = hid_grads_dims[2];
    const int numModules = hidGradsSizeX * hidGradsSizeY;
    // a non-positive partial_sum means "sum over all modules at once"
    int partialSum = %(partial_sum)d > 0 ? %(partial_sum)d : numModules;
    if (numModules %% partialSum > 0) {
        PyErr_Format(PyExc_ValueError,
            "partialSum must divide numModules, but partialSum=%%d and "
            "numModules=%%d", partialSum, numModules);
        %(fail)s;
    }
    """
    num_braces += 1

    # Convert images into nv_images, an NVMatrix, for compatibility
    # with the cuda-convnet functions.
    setup_nv_images = self._argument_contiguity_check("images") + """
    if (%(images)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have nd=4, got nd=%%i", %(images)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1
    const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
    const int img_channels = images_dims[0];
    if (img_channels > 3 && img_channels %% 4 != 0)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have 3 or fewer channels, or have a multiple of 4 channels, got %%i",
            img_channels);
        %(fail)s;
    }

    { //setup_nv_images brace 2
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];
    NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX,
                       batch_size, "weight_acts: nv_images");
    """
    num_braces += 2

    # Convert hid_grads into nv_hid_grads, an NVMatrix, for compatibility
    # with the cuda-convnet functions. Its rank was already validated in
    # basic_setup, so only the contiguity check remains here.
    setup_nv_hid_grads = self._argument_contiguity_check("hid_grads") + """
    { //setup_nv_hid_grads brace 1
    const int numFilters = hid_grads_dims[0];
    const int batch_size = hid_grads_dims[3];
    NVMatrix nv_hid_grads(%(hid_grads)s,
                          numFilters * hidGradsSizeY * hidGradsSizeX,
                          batch_size, "weight_acts:nv_hid_grads");
    """
    num_braces += 1

    # Allocate the output and, when partial sums are requested, a 5-d
    # scratch buffer whose leading axis indexes the partial sums.
    setup_nv_weights_grads = """
    int filters_dims[4];
    // filters: (input channels, filter rows, filter cols, output channels)
    filters_dims[0] = img_channels;
    filters_dims[1] = imgSizeY - hidGradsSizeY + 1 - 2 * paddingStart;
    filters_dims[2] = imgSizeX - hidGradsSizeX + 1 - 2 * paddingStart;
    assert(filters_dims[1] == filters_dims[2]); // only square kernels are supported
    filters_dims[3] = numFilters;
    const int filterSize = filters_dims[1];
    int partialsum_storage_dims[5];
    for (int i = 1; i < 5; i++)
    {
        partialsum_storage_dims[i] = filters_dims[i - 1];
    }
    partialsum_storage_dims[0] = numModules / partialSum;
    CudaNdarray *partialsum_storage = NULL;
    if (partialSum != numModules &&
        CudaNdarray_prep_output(&partialsum_storage, 5,
                                partialsum_storage_dims))
    {
        %(fail)s;
    }

    for (int i = 0; i < 4; i++)
    {
        if (filters_dims[i] <= 0)
        {
            printf("filters_dims[%%d] = %%d\\n", i, filters_dims[i]);
            assert(false);
        }
    }
    if (CudaNdarray_prep_output(& %(weights_grads)s, 4, filters_dims))
    {
        // partialsum_storage is still NULL when partialSum == numModules,
        // so the NULL-tolerant decref is required here.
        Py_XDECREF(partialsum_storage);
        %(fail)s;
    }

    { // setup_nv_weights_grad brace # 1
    NVMatrix nv_weights_grads(%(weights_grads)s,
                              filters_dims[0] * filterSize * filterSize,
                              numFilters, "weight_acts:nv_weights_grads");
    """
    num_braces += 1

    # Either write straight into the output (single partial sum) or
    # accumulate into the scratch buffer and then sum out its leading axis.
    run_kernel = """
    if (partialSum == numModules)
        _weightActs(nv_images, nv_hid_grads, nv_weights_grads,
                    imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
                    paddingStart, moduleStride, img_channels, numGroups,
                    partialSum, 0, 1);
    else {
        NVMatrix nv_partialsum(partialsum_storage,
                (numModules / partialSum) * filters_dims[0] * filterSize * filterSize,
                numFilters, "weight_acts: nv_partialsum");
        _weightActs(nv_images, nv_hid_grads, nv_partialsum,
                    imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
                    paddingStart, moduleStride, img_channels, numGroups,
                    partialSum, 0, 1);
        nv_partialsum.reshape((numModules / partialSum),
                filters_dims[0] * filterSize * filterSize * numFilters);

        // sum out axis 0 of nv_partialsum
        #define AXIS 0
        // scale the contents of nv_weights_grads by 0
        // i.e., clear out its pre-existing content
        #define SCALE_THIS 0
        // scale the new sum by 1, i.e., don't do any scaling
        #define SCALE_SUM 1
        nv_weights_grads.addSum(nv_partialsum, AXIS, SCALE_THIS, SCALE_SUM);
        Py_DECREF(partialsum_storage);
    }
    """

    braces = '}' * num_braces

    rval = (basic_setup +
            setup_nv_images +
            setup_nv_hid_grads +
            setup_nv_weights_grads +
            run_kernel +
            braces)

    rval = render_string(rval, locals())

    return rval
def c_code(self, node, name, inputs, outputs, sub):
    """
    Build the C source that runs cuda-convnet's
    `convLocalStochasticMaxPool` on a 4-d image batch.

    Parameters
    ----------
    node : unused here
    name : unused here
    inputs : sequence of two C variable names, `(images, seed)`
    outputs : sequence of one C variable name, `(targets,)`
    sub : dict
        `sub['fail']` is the C statement emitted after an error is set.

    Returns
    -------
    str
        The rendered C code.

    Raises
    ------
    UnimplementedError
        If `self.copy_non_contiguous` is set.
    """
    # The template is rendered against locals(); images, seed, targets,
    # fail, start, stride and ds are looked up by exactly these names.
    images, seed = inputs
    targets, = outputs
    fail = sub['fail']

    if self.copy_non_contiguous:
        raise UnimplementedError()

    # C fragments in emission order, plus the number of scopes left open.
    pieces = ["#define STOCHASTICMAXPOOL_COPY_NON_CONTIGUOUS 0\n"]
    open_scopes = 0

    # Wrap images in nv_images, an NVMatrix, as the cuda-convnet
    # functions expect, after validating rank and geometry.
    pieces.append(self._argument_contiguity_check("images"))
    pieces.append("""
    if (%(images)s->nd != 4)
    {
        PyErr_Format(PyExc_ValueError,
            "images must have nd=4, got nd=%%i", %(images)s->nd);
        %(fail)s;
    }

    { //setup_nv_images brace 1

    const int * images_dims = CudaNdarray_HOST_DIMS(%(images)s);
    const int img_channels = images_dims[0];
    const int imgSizeY = images_dims[1];
    const int imgSizeX = images_dims[2];
    const int batch_size = images_dims[3];

    if(imgSizeY != imgSizeX){
        PyErr_Format(PyExc_ValueError,
            "images must be square(dims[1] == dims[2]). Shape (%%i,%%i,%%i,%%i)",
            img_channels, imgSizeY, imgSizeX, batch_size);
        %(fail)s;
    }
    if(%(ds)s > imgSizeY){
        PyErr_Format(PyExc_ValueError,
            "ds(%%d) must be <= imgSizeX(%%d) and imgSizeY(%%d).",
            %(ds)s, imgSizeX, imgSizeY);
        %(fail)s;
    }
    if(%(start)s >= imgSizeX){
        PyErr_Format(PyExc_ValueError,
            "start is %%d but must be smaller then the images size of %%d x %%d.",
            %(start)s, imgSizeX, imgSizeY);
        %(fail)s;
    }

    NVMatrix nv_images(%(images)s, img_channels * imgSizeY * imgSizeX, batch_size,
    "MaxPool:nv_images");

    //int * seed = CudaNdarray_HOST_DIMS%(seed)s;
    float * seed = CudaNdarray_DEV_DATA(%(seed)s);
    //int * seed = %(seed)s;
    """)
    open_scopes += 1

    # Size and allocate the output, then wrap it in nv_targets.
    pieces.append("""
    //int _outputsX = int(ceil((dic['imgSize'] - dic['start'] - dic['sizeX']) / float(dic['stride']))) + 1;
    int _outputsX = ((int)(ceil((imgSizeY - %(start)s - %(ds)s) / ((float)%(stride)s)))) + 1;

    int target_dims [] = {
        img_channels,
        _outputsX,
        _outputsX,
        batch_size };

    if (CudaNdarray_prep_output(& %(targets)s, 4, target_dims))
    {
        %(fail)s;
    }

    { // setup_nv_target brace # 1

    NVMatrix nv_targets(%(targets)s, target_dims[0] * target_dims[1] * target_dims[2],
                        target_dims[3], "MaxPool:nv_targets");

    """)
    open_scopes += 1

    # The pooling call itself.
    pieces.append("""
    convLocalStochasticMaxPool(nv_images, nv_targets, img_channels, %(ds)s,
                               %(start)s, %(stride)s, _outputsX, MaxPooler(), seed);
    """)

    # Close every scope opened by the fragments above.
    pieces.append('}' * open_scopes)
    template = "".join(pieces)

    # Pooling parameters are substituted directly into the C text.
    start = self.start
    stride = self.stride
    ds = self.ds

    return template % locals()