def generate_kernel(self, node, nodename):
    """Build the ElemwiseKernel that applies ``self.scalar_op`` elementwise.

    A scalar-typed stand-in node is created from the dtypes of ``node``'s
    inputs and outputs and handed to the scalar op's C code generator.
    Outputs listed in ``self.inplace_pattern`` do not get a kernel argument
    of their own; they are written through the input they overwrite.

    Raises SupportCodeError when the scalar op needs per-apply support code,
    or global support code other than the ``THEANO_MACRO_MOD`` macro.
    """
    allowed_macro = "#define THEANO_MACRO_MOD(x,y) (x % y)"

    in_args = [make_argument(var, 'i%d' % (pos,))
               for pos, var in enumerate(node.inputs)]
    in_types = [scalar.Scalar(var.dtype) for var in node.inputs]
    out_args = [make_argument(var, 'o%d' % (pos,))
                for pos, var in enumerate(node.outputs)
                if pos not in self.inplace_pattern]
    out_types = [scalar.Scalar(var.dtype) for var in node.outputs]

    scalar_node = Apply(self.scalar_op,
                        [make_var() for make_var in in_types],
                        [make_var() for make_var in out_types])

    # Any per-apply support code is a deal breaker.
    try:
        apply_code = self.scalar_op.c_support_code_apply(scalar_node,
                                                         nodename)
        if apply_code:
            raise SupportCodeError(apply_code)
    except MethodNotDefined:
        pass

    try:
        support_code = self.scalar_op.c_support_code()
    except MethodNotDefined:
        support_code = ""

    # The macro is fine, the C++ struct is not.
    if support_code.strip() not in ("", allowed_macro):
        raise SupportCodeError(support_code)

    # Destination expression for every scalar output: either the input
    # buffer it overwrites, or the next dedicated output argument.
    dedicated_outs = iter(out_args)
    out_exprs = []
    for pos in range(len(scalar_node.outputs)):
        if pos in self.inplace_pattern:
            target = in_args[self.inplace_pattern[pos]]
        else:
            target = next(dedicated_outs)
        out_exprs.append(target.name + '[i]')

    in_exprs = [arg.name + '[i]' for arg in in_args]
    kop = self.scalar_op.c_code(scalar_node, nodename + '_scalar',
                                in_exprs, out_exprs,
                                dict(fail='return;'))

    # Translate types for scalar composite ops (except complex).
    support_code += """
#define npy_float64 ga_double
#define npy_float32 ga_float
#define npy_uint8 ga_ubyte
#define npy_int8 ga_byte
#define npy_uint16 ga_ushort
#define npy_int16 ga_short
#define npy_uint32 ga_uint
#define npy_int32 ga_int
#define npy_uint64 ga_ulong
#define npy_int64 ga_long
"""
    return ElemwiseKernel(None, in_args + out_args, kop,
                          preamble=support_code)
def make_node(self, *inputs):
    """Create the GPU Apply node, rejecting scalar ops that need support code.

    ``Elemwise.make_node`` is used only to obtain the broadcast pattern and
    dtype of each output; the returned node has GpuArrayType outputs and
    inputs converted with ``as_gpuarray_variable``.

    Raises SupportCodeError when the scalar op reports per-apply support
    code, or global support code other than the ``THEANO_MACRO_MOD`` macro.
    """
    allowed_macro = "#define THEANO_MACRO_MOD(x,y) (x % y)"

    base_node = Elemwise.make_node(self, *inputs)
    gpu_outputs = [GpuArrayType(broadcastable=out.type.broadcastable,
                                dtype=out.type.dtype)()
                   for out in base_node.outputs]
    gpu_inputs = [as_gpuarray_variable(inp) for inp in inputs]
    node = Apply(self, gpu_inputs, gpu_outputs)

    # Try to generate the kernel to catch SupportCodeErrors
    try:
        # The kernel arguments are built and discarded here; only the
        # scalar stand-in node is needed for the support-code query.
        for pos, var in enumerate(node.inputs):
            make_argument(var, 'i%d' % (pos,))
        in_types = [scalar.Scalar(var.dtype) for var in node.inputs]
        for pos, var in enumerate(node.outputs):
            if pos not in self.inplace_pattern:
                make_argument(var, 'o%d' % (pos,))
        out_types = [scalar.Scalar(var.dtype) for var in node.outputs]

        scalar_node = Apply(self.scalar_op,
                            [make_var() for make_var in in_types],
                            [make_var() for make_var in out_types])
        apply_code = self.scalar_op.c_support_code_apply(scalar_node, "test")
        if apply_code:
            raise SupportCodeError(apply_code)
    except MethodNotDefined:
        pass

    try:
        global_code = self.scalar_op.c_support_code()
        # The macro is fine, the C++ struct is not.
        if global_code.strip() not in ("", allowed_macro):
            raise SupportCodeError(global_code)
    except MethodNotDefined:
        pass

    return node
def task_code(d):
    """Write the scalar op's C code for slot ``d`` to the enclosing ``sio``.

    Inputs are addressed as ``i<pos>_data_<d>[0]`` and outputs as
    ``o<pos>_data_<d>[0]``.  ``self``, ``node``, ``nodename`` and ``sio``
    come from the enclosing scope.
    """
    scalar_inputs = [scalar.Scalar(dtype=var.type.dtype)()
                     for var in node.inputs]
    scalar_outputs = [scalar.Scalar(dtype=var.type.dtype)()
                      for var in node.outputs]
    scalar_node = Apply(self.scalar_op, scalar_inputs, scalar_outputs)

    in_exprs = ['i%i_data_%i[0]' % (pos, d)
                for pos, _ in enumerate(node.inputs)]
    out_exprs = ['o%i_data_%i[0]' % (pos, d)
                 for pos, _ in enumerate(node.outputs)]

    # TODO: set a failure code somehow!!!
    snippet = self.scalar_op.c_code(scalar_node,
                                    nodename + '_scalar_',
                                    in_exprs,
                                    out_exprs,
                                    sub=dict(fail='return;'))
    print >> sio, snippet
def generate_kernel(self, node, nodename):
    """Build the ElemwiseKernel that applies ``self.scalar_op`` elementwise.

    A scalar-typed stand-in node is created from the dtypes of ``node``'s
    inputs and outputs and handed to the scalar op's C code generator.
    Outputs listed in ``self.inplace_pattern`` are written through the input
    they overwrite instead of getting their own kernel argument.  The kernel
    preamble defines the ``ga_*`` type names and includes the Python and
    NumPy headers.
    """
    in_args = [make_argument(var, 'i%d' % (pos,))
               for pos, var in enumerate(node.inputs)]
    in_types = [scalar.Scalar(var.dtype) for var in node.inputs]
    out_args = [make_argument(var, 'o%d' % (pos,))
                for pos, var in enumerate(node.outputs)
                if pos not in self.inplace_pattern]
    out_types = [scalar.Scalar(var.dtype) for var in node.outputs]

    scalar_node = Apply(self.scalar_op,
                        [make_var() for make_var in in_types],
                        [make_var() for make_var in out_types])

    # Destination expression for every output: either the input buffer it
    # overwrites, or the next dedicated output argument.
    dedicated_outs = iter(out_args)
    out_exprs = []
    for pos in range(len(node.outputs)):
        if pos in self.inplace_pattern:
            target = in_args[self.inplace_pattern[pos]]
        else:
            target = next(dedicated_outs)
        out_exprs.append(target.name + '[i]')

    in_exprs = [arg.name + '[i]' for arg in in_args]
    kop = self.scalar_op.c_code(scalar_node, nodename + '_scalar',
                                in_exprs, out_exprs,
                                dict(fail='return;'))

    # Translate types for scalar composite ops (except complex).
    support_code = """
#ifdef _MSC_VER
#define signed __int8 int8_t
#define unsigned __int8 uint8_t
#define signed __int16 int16_t
#define unsigned __int16 uint16_t
#define signed __int32 int32_t
#define unsigned __int32 uint32_t
#define signed __int64 int64_t
#define unsigned __int64 uint64_t
#else
#include <stdint.h>
#endif
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
#include <Python.h>
#include <numpy/npy_common.h>
"""
    return ElemwiseKernel(None, in_args + out_args, kop,
                          preamble=support_code)
def c_src_kernel_Ccontiguous(self, node, nodename):
    """Return CUDA source for the kernel that indexes every buffer linearly.

    The generated kernel is named
    ``kernel_<scalar op class>_<nodename>_Ccontiguous`` and takes the element
    count followed by one ``float`` pointer per input and per output.  Each
    thread starts at its global index and advances by the total thread count
    until ``numEls`` is reached.  Inputs for which ``_logical_scalar`` is
    true have element 0 loaded into a local before the loop.
    """
    nd = node.outputs[0].type.ndim
    sio = StringIO.StringIO()

    # Leading comments listing the argument types.
    for pos, var in enumerate(node.inputs):
        print >> sio, "// Input ", pos, str(var.type)
    for pos, var in enumerate(node.outputs):
        print >> sio, "// Output ", pos, str(var.type)

    # Signature: element count, then inputs, then outputs.
    op_name = self.scalar_op.__class__.__name__
    print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % (
        op_name, nodename)
    for pos, _ in enumerate(node.inputs):
        print >> sio, "\t,", "const float * i%i_data" % pos
    for pos, _ in enumerate(node.outputs):
        print >> sio, "\t,", "float * o%i_data" % pos
    print >> sio, "\t)\n{"

    print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
    print >> sio, " const int numThreads = blockDim.x * gridDim.x;"

    # For each input that is a scalar which has been broadcasted to a
    # tensor, load it into a local variable.
    for pos, var in enumerate(node.inputs):
        if _logical_scalar(var):
            print >> sio, " const float ii_i%i_value = i%i_data[0];" % (
                pos, pos)

    # Loop over the elements to be treated by this kernel call.
    print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"

    # Perform the scalar operation on the input and output references.
    # TODO: What if the scalar_op needs support_code??
    scalar_node = Apply(
        self.scalar_op,
        [scalar.Scalar(dtype=var.type.dtype)() for var in node.inputs],
        [scalar.Scalar(dtype=var.type.dtype)() for var in node.outputs])
    scalar_name = nodename + '_scalar_'
    in_exprs = get_str_list_logical_scalar(node, data_str='i%i_data[i]')
    out_exprs = ['o%i_data[i]' % pos for pos, _ in enumerate(node.outputs)]
    # TODO: set a failure code somehow!!!
    task_code = self.scalar_op.c_code(scalar_node, scalar_name,
                                      in_exprs, out_exprs,
                                      sub=dict(fail='return;'))
    print >> sio, " ", task_code
    print >> sio, " }"
    print >> sio, "}"

    return sio.getvalue()
def PRelu(x):
    """Return ``clip(0.625 * x + 0.375 * abs(x), 0, 1)``.

    Both coefficients are created as constants in the float dtype that
    ``scalar.upgrade_to_float`` selects for ``x.dtype``.

    NOTE(review): for negative ``x`` the weighted sum is ``0.25 * x``, which
    the lower clip bound then maps to 0 -- confirm the clip is intended.
    """
    float_dtype = scalar.upgrade_to_float(
        scalar.Scalar(dtype=x.dtype))[0].dtype
    linear_gain = T.constant(0.625, dtype=float_dtype)
    abs_gain = T.constant(0.375, dtype=float_dtype)
    weighted = x * linear_gain + abs(x) * abs_gain
    return T.clip(weighted, 0, 1)
def Relu(x):
    """Return ``clip((x + abs(x)) * 0.5, 0, 1)``.

    The 0.5 factor is created as a constant in the float dtype that
    ``scalar.upgrade_to_float`` selects for ``x.dtype``.  ``(x + abs(x)) / 2``
    is zero for negative ``x`` and ``x`` otherwise; the result is then
    capped at 1 by the clip.
    """
    float_dtype = scalar.upgrade_to_float(
        scalar.Scalar(dtype=x.dtype))[0].dtype
    half = T.constant(0.5, dtype=float_dtype)
    positive_part = (x + abs(x)) * half
    return T.clip(positive_part, 0, 1)
def safe_new(x, tag='', dtype=None):
    """
    Internal function that constructs a new variable from x with the same
    type, but with a different name (old name + tag). This function is used
    by gradient, or the R-op to construct new variables for the inputs of
    the inner graph such that there is no interference between the original
    graph and the newly constructed graph.

    :param x: variable (or value convertible by ``as_tensor_variable``) to
        mirror.
    :param tag: suffix appended to ``x.name`` when ``x`` has a name.
    :param dtype: optional dtype for the new variable; when it differs from
        the dtype of ``x`` the new variable is created with the cast type.
    :return: for constants, a clone (or a re-typed constant carrying a
        shallow copy of the original tag); otherwise a fresh variable of the
        (possibly cast) type of ``x``.
    """
    # Bind ``copy`` to the module locally.  The function needs both
    # ``copy.copy`` and ``copy.deepcopy``; relying on a file-level name that
    # is either the module or the function makes one of the two calls fail.
    import copy

    if hasattr(x, 'name') and x.name is not None:
        nw_name = x.name + tag
    else:
        nw_name = None

    if isinstance(x, theano.Constant):
        if dtype and x.dtype != dtype:
            casted_x = x.astype(dtype)
            nwx = x.__class__(casted_x.type, x.data, x.name)
            nwx.tag = copy.copy(x.tag)
            return nwx
        else:
            return x.clone()
    # Note, as_tensor_variable will convert the Scalar into a
    # TensorScalar that will require a ScalarFromTensor op,
    # making the pushout optimization fail
    elif isinstance(x, scalar.ScalarVariable):
        if dtype:
            nw_x = scalar.Scalar(dtype=dtype)()
        else:
            nw_x = x.type()
        nw_x.name = nw_name
        return nw_x
    else:
        try:
            x = tensor.as_tensor_variable(x)
        except TypeError:
            # This could happen for example for random states, and I really
            # want to avoid the convoluted logic that checks for cuda
            # ndarrays
            pass

    nw_x = x.type()
    if dtype and nw_x.dtype != dtype:
        nw_x = nw_x.astype(dtype).type()
    nw_x.name = nw_name

    # Preserve test values so that the 'compute_test_value' option can be
    # used.  The test value is deep-copied to ensure there can be no
    # interactions between test values, due to inplace operations for
    # instance.  This may not be the most efficient memory-wise, though.
    if theano.config.compute_test_value != 'off':
        try:
            nw_x.tag.test_value = copy.deepcopy(gof.op.get_test_value(x))
        except AttributeError:
            # This means `x` has no test value.
            pass

    return nw_x
def hard_sigmoid(x):
    """An approximation of sigmoid.

    More approximate and faster than ultra_fast_sigmoid.

    Approx in 3 parts: 0, scaled linear, 1

    Removing the slope and shift does not make it faster.

    """
    # The constants are created in the dtype chosen by "upgrade_to_float"
    # so that the arithmetic below happens in that dtype.
    float_dtype = scalar.upgrade_to_float(
        scalar.Scalar(dtype=x.dtype))[0].dtype
    slope = tensor.constant(0.2, dtype=float_dtype)
    shift = tensor.constant(0.5, dtype=float_dtype)
    linear = (x * slope) + shift
    return tensor.clip(linear, 0, 1)
def to_scalar_type(self):
    """Return a ``scal.Scalar`` type built from this type's ``dtype``."""
    scalar_type = scal.Scalar(dtype=self.dtype)
    return scalar_type
# Builds a small graph of float32 scalar ops with shared sub-expressions and
# compiles it with two outputs (A and D).
# NOTE(review): this looks like a minimal reproduction of a graph-compilation
# issue; the construction order below may matter -- confirm before reordering.
from theano import scalar
from theano import function

# Two named float32 scalar inputs.
s0 = scalar.Scalar(dtype='float32')(name='s0')
s1 = scalar.Scalar(dtype='float32')(name='s1')

# mx1 re-uses mx0 together with one of its own inputs.
mx0 = scalar.maximum(s0, s1)
mx0.name = 'mx0'
mx1 = scalar.maximum(mx0, s0)
mx1.name = 'mx1'

E = scalar.second(mx1, 1)
E.name = 'E'
# D is used both inside C (and therefore A) and as an output of its own.
D = scalar.eq(mx1, mx0)
D.name = 'D'
C = D * E
C.name = 'C'
B = scalar.eq(mx0, s1)
B.name = 'B'
A = B * C
A.name = 'A'

# The compiled function is not kept; only the compilation itself is run.
function([s0, s1], [A, D])
def c_init_code(self):
    """Return the C init code of the ``scal.Scalar`` type for ``self.dtype``."""
    scalar_type = scal.Scalar(self.dtype)
    return scalar_type.c_init_code()
def c_code_cache_version(self):
    """Return the C code cache version for this type.

    The version is ``(10,)`` followed by the cache version of the
    ``scal.Scalar`` type for ``self.dtype``.  When that scalar version is
    empty (or otherwise falsy) an empty tuple is returned instead.
    """
    scalar_version = scal.Scalar(self.dtype).c_code_cache_version()
    if not scalar_version:
        return ()
    return (10, ) + scalar_version
def c_compile_args(self):
    """Return the compile args of the ``scal.Scalar`` type for ``self.dtype``."""
    scalar_type = scal.Scalar(self.dtype)
    return scalar_type.c_compile_args()
def c_support_code(self):
    """Override `CLinkerObject.c_support_code`.

    Delegates to the ``scal.Scalar`` type built from ``self.dtype``.
    """
    scalar_type = scal.Scalar(self.dtype)
    return scalar_type.c_support_code()
def c_libraries(self):
    """Return the libraries of the ``scal.Scalar`` type for ``self.dtype``."""
    scalar_type = scal.Scalar(self.dtype)
    return scalar_type.c_libraries()
def c_headers(self):
    """Override `CLinkerObject.c_headers`.

    Delegates to the ``scal.Scalar`` type built from ``self.dtype``.
    """
    scalar_type = scal.Scalar(self.dtype)
    return scalar_type.c_headers()
def c_src_kernel_tiling(self, node, nodename):
    """Return CUDA source for the tiled elementwise kernel.

    Kernel text is only produced when the first output is 4-dimensional;
    for any other rank the returned string is empty.

    The generated kernel is named
    ``kernel_<scalar op class>_<nodename>_tiling4``.  It takes the element
    count, the four dimensions, and for every input and output a ``float``
    pointer followed by one stride per dimension.  Thread (0, 0) of each
    block copies the dimensions, and element 0 of every input for which
    ``_logical_scalar`` is true, into ``__shared__`` arrays before a
    ``__syncthreads()``.
    """
    # The generated kernel is structured roughly like this:
    #
    #   static __global__ void kernel()
    #   {
    #       for (int pos0 = blockIdx.x; pos0 < dim0; pos0 += gridDim.x)
    #         for (int pos1 = blockIdx.y; pos1 < dim1; pos1 += gridDim.y)
    #           for (int pos2 = threadIdx.y; pos2 < dim2; pos2 += blockDim.y)
    #             for (int pos3 = threadIdx.x; pos3 < dim3; pos3 += blockDim.x)
    #               out[pos0 * out_stride[0] + ...] = f(in1[...], in2[...])
    #   }

    nd = node.outputs[0].type.ndim
    sio = StringIO()

    if nd in (4, ):
        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(node.inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
            self.scalar_op.__class__.__name__, nodename, 'tiling%i' % nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i
                                           for i in xrange(nd))
        # declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data" % ipos] +
                          list("int i%i_str_%i" % (ipos, d)
                               for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data" % ipos] +
                          list("int o%i_str_%i" % (ipos, d)
                               for d in xrange(nd)))
            print >> sio, "\t,", s
        print >> sio, "\t)\n{"

        # For each input that is a scalar which has been broadcasted to a
        # tensor, load it into a shared variable (one slot per input; only
        # the slots of logical scalars are filled).
        print >> sio, " __shared__ float value0[%i];" % len(node.inputs)
        print >> sio, " __shared__ int shared_dims[%(nd)s];" % locals()
        print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos)
        for ipos in xrange(nd):
            print >> sio, " shared_dims[%i] = dim%i;" % (ipos, ipos)
        print >> sio, " }"
        print >> sio, " __syncthreads();"

        if (nd == 4):
            print >> sio, """
            for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
            {
                for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                {
                    //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                    for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                    {
                        //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                        for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                        {
            """
        else:
            raise NotImplementedError()

        # Start every non-scalar pointer at its base, then advance it by
        # position * stride along each dimension.
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print >> sio, " const float * ii_i%i_data = i%i_data;" % (
                    ipos, ipos)
        for ipos, i in enumerate(node.outputs):
            print >> sio, " float * ii_o%i_data = o%i_data;" % (
                ipos, ipos)
        for d in xrange(nd):
            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (
                        ipos, d, ipos, d)
            for ipos, i in enumerate(node.outputs):
                print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (
                    ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(self.scalar_op,
                  [scalar.Scalar(dtype=input.type.dtype).make_variable()
                   for input in node.inputs],
                  [scalar.Scalar(dtype=output.type.dtype).make_variable()
                   for output in node.outputs]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node, value_str='value0[%i]'),
            ['ii_o%i_data[0]' % ipos
             for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print >> sio, " ", task_code

        # Close the nd nested loops opened above.
        print >> sio, " }" * nd

        # TODO: insert runtime stride checks that select the best loop order
        # either here, or in the host code that launched the kernel (host
        # code probably better spot)
        print >> sio, "}"

    # The generated source is returned only; it is no longer echoed to
    # stdout on every call.
    return sio.getvalue()
def make_node(self, *inputs):
    """Create the GPU Apply node and attach its kernel to ``res.tag``.

    Every input is converted with ``as_gpuarray_variable``; all inputs must
    have the same rank.  An output dimension is broadcastable only when it
    is broadcastable in every input.  The scalar op's C code is stored in
    ``res.tag.kcode`` and the resulting ElemwiseKernel in
    ``res.tag.kernel``.

    Raises TypeError on a wrong argument count or mismatched ranks.
    Raises SupportCodeError when the scalar op needs per-apply support code,
    or global support code other than the ``THEANO_MACRO_MOD`` macro.
    """
    _inputs = [as_gpuarray_variable(i) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError("Wrong argument count", (self.nin, len(_inputs)))

    # Compare against the converted first input: the raw argument may be a
    # plain value without a ``.type`` attribute.
    ndim = _inputs[0].type.ndim
    for i in _inputs[1:]:
        if i.type.ndim != ndim:
            raise TypeError('mismatched rank amongst inputs')

    broadcastable = [all(i.type.broadcastable[d] for i in _inputs)
                     for d in xrange(ndim)]
    assert self.nout > 0

    # Kernel arguments and scalar types are derived from the converted
    # inputs, for the same reason as the rank check above.
    inps = [make_argument(i, 'i%d' % (n, )) for n, i in enumerate(_inputs)]
    scal_ins = [scalar.Scalar(i.dtype) for i in _inputs]

    res = Apply(self, _inputs,
                [GpuArrayType(o.dtype, broadcastable)()
                 for o in self.scalar_op.output_types(scal_ins)])

    outs = [make_argument(o, 'o%d' % (n, ))
            for n, o in enumerate(res.outputs)]
    scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]

    fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                      [o() for o in scal_out])

    kcode = self.scalar_op.c_code(fake_node, 'kcode',
                                  [i.expr() for i in inps],
                                  [o.expr() for o in outs],
                                  sub=dict(fail='return;'))
    res.tag.kcode = kcode

    try:
        code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
        if code:
            raise SupportCodeError(code)
    except MethodNotDefined:
        pass

    support_code = ""
    try:
        support_code += self.scalar_op.c_support_code()
    except MethodNotDefined:
        pass

    # Avoid the C++ complex struct: only the modulo macro is acceptable.
    # An op that defines no support code at all is acceptable as well.
    if support_code.strip() not in (
            "", "#define THEANO_MACRO_MOD(x,y) (x % y)"):
        raise SupportCodeError(support_code)

    k = ElemwiseKernel(None, inps + outs, kcode, preamble=support_code)
    res.tag.kernel = k

    return res
def c_src_kernel(self, node, nodename, nd):
    """Return CUDA source for the strided ``nd``-dimensional kernel.

    The generated kernel is named ``kernel_<scalar op class>_<nodename>_<nd>``
    and takes the element count, the ``nd`` dimensions, and for every input
    and output a ``float`` pointer followed by one stride per dimension.
    Each thread starts at its global index and advances by the total thread
    count; the linear index is decomposed into per-dimension positions
    (last dimension first) that are multiplied by the strides to locate
    each element.
    """
    sio = StringIO()

    # Leading comments listing the argument types.
    for pos, var in enumerate(node.inputs):
        print >> sio, "// Input ", pos, str(var.type)
    for pos, var in enumerate(node.outputs):
        print >> sio, "// Output ", pos, str(var.type)

    op_name = self.scalar_op.__class__.__name__
    print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
        op_name, nodename, nd)
    if nd:
        print >> sio, "\t,", ", ".join("const int dim%i" % dim
                                       for dim in xrange(nd))

    # Declare inputs: data pointer followed by its strides.
    for pos, _ in enumerate(node.inputs):
        strides = ["int i%i_str_%i" % (pos, dim) for dim in xrange(nd)]
        decl = ", ".join(["const float * i%i_data" % pos] + strides)
        print >> sio, "\t,", decl

    # Declare outputs: data pointer followed by its strides.
    for pos, _ in enumerate(node.outputs):
        strides = ["int o%i_str_%i" % (pos, dim) for dim in xrange(nd)]
        decl = ", ".join(["float * o%i_data" % pos] + strides)
        print >> sio, "\t,", decl
    print >> sio, "\t)\n{"

    print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
    print >> sio, " const int numThreads = blockDim.x * gridDim.x;"

    # For each input that is a scalar which has been broadcasted to a
    # tensor, load it into a local variable.
    for pos, var in enumerate(node.inputs):
        if _logical_scalar(var):
            print >> sio, " const float ii_i%i_value = i%i_data[0];" % (
                pos, pos)

    # Loop over the elements to be treated by this kernel call.
    print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"

    # Calculate the data pointers for all arguments.
    print >> sio, " int ii = i;"
    for pos, var in enumerate(node.inputs):
        if not _logical_scalar(var):
            print >> sio, " const float * ii_i%i_data = i%i_data;" % (
                pos, pos)
    for pos, _ in enumerate(node.outputs):
        print >> sio, " float * ii_o%i_data = o%i_data;" % (pos, pos)

    for dim in reversed(xrange(nd)):
        if dim > 0:
            print >> sio, " int pos%i = ii %% dim%i;" % (dim, dim)
            print >> sio, " ii = ii / dim%i;" % dim
        else:
            # Whatever remains is the position along the first dimension.
            print >> sio, " int pos%i = ii;" % dim

        for pos, var in enumerate(node.inputs):
            if not _logical_scalar(var):
                print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (
                    pos, dim, pos, dim)
        for pos, _ in enumerate(node.outputs):
            print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (
                pos, dim, pos, dim)

    # Perform the scalar operation on the input and output references.
    # TODO: What if the scalar_op needs support_code??
    scalar_node = Apply(
        self.scalar_op,
        [scalar.Scalar(dtype=var.type.dtype).make_variable()
         for var in node.inputs],
        [scalar.Scalar(dtype=var.type.dtype).make_variable()
         for var in node.outputs])
    scalar_name = nodename + '_scalar_'
    in_exprs = get_str_list_logical_scalar(node)
    out_exprs = ['ii_o%i_data[0]' % pos
                 for pos, _ in enumerate(node.outputs)]
    # TODO: set a failure code somehow!!!
    task_code = self.scalar_op.c_code(scalar_node, scalar_name,
                                      in_exprs, out_exprs,
                                      sub=dict(fail='return;'))
    print >> sio, " ", task_code
    print >> sio, " }"
    print >> sio, "}"

    return sio.getvalue()