def get_native_compile_info(self, input_types, devtype): assert devtype == "gpu" d = dict(cdtype=core.np2c[input_types[0].dtype]) cuda_code=r""" #include "cgt_cuda.h" #include "lrn.cuh" void launchker_$function(int num_img, int channels, int height, int width, int localsize, double alpha, double beta, %(cdtype)s* Xdata, %(cdtype)s* topdata, %(cdtype)s* scalingdata, %(cdtype)s* topdiffdata, %(cdtype)s* bottomdiffdata) { int nblocks, nthreads; int size = num_img * width * height; cgt_get_bt(size, nblocks, nthreads); LRNComputeDiff<%(cdtype)s><<<nblocks, nthreads, 0>>>(size, (%(cdtype)s*)Xdata, (%(cdtype)s*)topdata, (%(cdtype)s*)scalingdata, (%(cdtype)s*)topdiffdata, num_img, channels, height, width, localsize, -beta, 2. * alpha * beta / localsize, (%(cdtype)s*)bottomdiffdata); CUDA_CHECK_ERROR("CrossChannelLRNBackward"); } """%d code = """ void launchker_$function(int num_img, int channels, int height, int width, int localsize, double alpha, double beta, %(cdtype)s* Xdata, %(cdtype)s* topdata, %(cdtype)s* scaledata, %(cdtype)s* topdiffdata, %(cdtype)s* bottomdiffdata); CGT_EXPORT_C void $function($closure* cldata, cgtArray** reads, cgtArray* bottom_diff) { cgtArray *X=reads[0], *top=reads[1], *scaling=reads[2], *top_diff=reads[3]; int num_img = X->shape()[0], channels = X->shape()[1], height = X->shape()[2], width = X->shape()[3]; launchker_$function(num_img, channels, height, width, cldata->localsize, cldata->alpha, cldata->beta, (%(cdtype)s*)X->data(), (%(cdtype)s*)top->data(), (%(cdtype)s*)scaling->data(), (%(cdtype)s*)top_diff->data(), (%(cdtype)s*)bottom_diff->data()); }"""%d return core.NativeCompileInfo(code, closure_triples = make_closure(self.info), link_flags="-lcudart", gpu_deref_mask=(True,True,True,True), extra_srcs=[core.SrcFile("cuda",cuda_code)])
def get_native_compile_info(self, input_types, devtype): assert devtype == "gpu" d = dict(cdtype=core.np2c[input_types[0].dtype]) cuda_code = r""" #include "cgt_cuda.h" #include "lrn.cuh" void launchker_$function(int num_img, int channels, int height, int width, int localsize, double alpha, double beta, %(cdtype)s* Xdata, %(cdtype)s* topdata, %(cdtype)s* scaledata) { int size = num_img * height * width; int nblocks, nthreads; cgt_get_bt(size, nblocks, nthreads); LRNFillScale<%(cdtype)s><<<nblocks, nthreads, 0>>>( size, Xdata, num_img, channels, height, width, localsize, alpha / localsize, scaledata); CUDA_CHECK_ERROR("LRNFillScale"); size = num_img * channels * width * height; cgt_get_bt(size, nblocks, nthreads); LRNComputeOutput<%(cdtype)s><<<nblocks, nthreads, 0>>>(size, Xdata, scaledata, -beta, topdata); CUDA_CHECK_ERROR("LRNComputeOutput"); }"""%d code = r""" extern void launchker_$function(int num_img, int channels, int height, int width, int localsize, double alpha, double beta, %(cdtype)s* Xdata, %(cdtype)s* topdata, %(cdtype)s* scaledata); CGT_EXPORT_C void $function($closure* cldata, cgtArray** reads, cgtTuple* write) { cgtArray* X = reads[0]; int num_img = X->shape()[0], channels = X->shape()[1], height = X->shape()[2], width = X->shape()[3]; cgtArray* top = (cgtArray*)write->getitem(0); cgtArray* scale = (cgtArray*)write->getitem(1); launchker_$function(num_img, channels, height, width, cldata->localsize, cldata->alpha, cldata->beta, (%(cdtype)s*)X->data(), (%(cdtype)s*)top->data(), (%(cdtype)s*)scale->data()); }"""%d return core.NativeCompileInfo(code, closure_triples = make_closure(self.info), link_flags="-lcudart", gpu_deref_mask=(True,), extra_srcs=[core.SrcFile("cuda",cuda_code)])
def get_native_compile_info(self, input_types, devtype): code = r""" CGT_EXPORT_C void $function(conv_closure* cl, cgtArray** reads, cgtTuple* write) { max_pool<%(cdtype)s>(cl, reads[0], static_cast<cgtArray*>(write->getitem(0)), static_cast<cgtArray*>(write->getitem(1))); }""" % dict(cdtype=core.np2c[input_types[0].dtype]) return core.NativeCompileInfo(code, closure_triples=info2closure(self.info), includes=["pooling.h"])
def get_native_compile_info(self, input_types, devtype): code = r""" CGT_EXPORT_C void $function(conv_closure* cl, cgtArray** reads, cgtArray* write) { max_pool_pullback<%(cdtype)s>(reads[0], reads[1], reads[2], reads[3], write); }""" % dict(cdtype=core.np2c[input_types[0].dtype]) return core.NativeCompileInfo(code, closure_triples=info2closure(self.info), includes=["pooling.h"])
def get_native_compile_info(self, input_types, devtype): assert devtype == "gpu" code = """ CGT_EXPORT_C void $setup(conv_closure* closure) {setup_cudnn(closure);} CGT_EXPORT_C void $teardown(conv_closure* closure) {teardown_cudnn(closure);} CGT_EXPORT_C void $function(conv_closure* closure, cgtArray** reads, cgtArray* write) { if (!closure->handle) setup_cudnn(closure); performConvBackwardData(closure, reads[1], reads[2], write); }""" return core.NativeCompileInfo(code, closure_triples=make_closure( self.ph, self.pw, self.sv, self.sh), includes=["cudnn_support.h"], link_flags="-lcudnn -lcudart")
def get_native_compile_info(self, _input_types, devtype): assert devtype == "gpu" code = """ CGT_EXPORT_C void $setup(pooling_closure* closure) {setup_cudnn(closure);} CGT_EXPORT_C void $teardown(pooling_closure* closure) {teardown_cudnn(closure);} CGT_EXPORT_C void $function(pooling_closure* closure, cgtArray** reads, cgtArray* write) { if (!closure->handle) setup_cudnn(closure); performPoolingForward(closure, reads[0], write); }""" return core.NativeCompileInfo(code, closure_triples=poolinfo2closure( self.info), includes=["cudnn_support.h"], link_flags="-lcudnn -lcudart")
def get_native_compile_info(self, input_types, devtype): assert devtype == "cpu" code = """ CGT_EXPORT_C void $function(void* cldata, cgtArray** reads, cgtTuple* write) { float* x = static_cast<float*>(reads[0]->data()); float* y = static_cast<float*>(static_cast<cgtArray*>(write->getitem(0))->data()); float* z = static_cast<float*>(static_cast<cgtArray*>(write->getitem(1))->data()); for (int i=0; i < reads[0]->size(); ++i) { y[i] = sinf(x[i]); z[i] = cosf(x[i]); } }""" return core.NativeCompileInfo(code, includes=["math.h"], link_flags="-lm")
def get_native_compile_info(self, input_types, devtype): d = dict(cdtype=core.np2c[input_types[0].dtype]) d.update(self.info._asdict()) code = r""" CGT_EXPORT_C void $function($closure* cl, cgtArray** reads, cgtArray* write) { cgtArray* col = reads[0]; size_t batchsize = reads[1]->at<size_t>(0), channels = reads[2]->at<size_t>(0), height = reads[3]->at<size_t>(0), width = reads[4]->at<size_t>(0); for (int i=0; i < batchsize; ++i) { col2im_cpu<%(cdtype)s, %(kernel_h)s,%(kernel_w)s,%(pad_h)s,%(pad_w)s,%(stride_h)s,%(stride_w)s> ((%(cdtype)s*)col->data() + col->stride(0)*i, channels, height, width,(%(cdtype)s*)write->data() + write->stride(0)*i); } }""" % d return core.NativeCompileInfo(code, includes=["im2col.h"], closure_triples=info2closure(self.info))
def get_native_compile_info(self, input_types, devtype): assert devtype == "cpu" d = dict(cdtype=core.np2c[input_types[0].dtype]) d.update(self.info._asdict()) code = r""" CGT_EXPORT_C void $function($closure* cl, cgtArray** reads, cgtArray* write) { cgtArray* im = reads[0]; const size_t* imshape = im->shape(); int batchsize = imshape[0], channels = imshape[1], height = imshape[2], width = imshape[3]; for (int i=0; i < batchsize; ++i) { im2col_cpu<%(cdtype)s, %(kernel_h)s,%(kernel_w)s,%(pad_h)s,%(pad_w)s,%(stride_h)s,%(stride_w)s> ((%(cdtype)s*)im->data() + im->stride(0)*i, channels, height, width, (%(cdtype)s*)write->data() + write->stride(0)*i); } }""" % d return core.NativeCompileInfo(code, includes=["im2col.h"], closure_triples=info2closure(self.info))