def make_node(self, data, rois):
    """Build the Apply node connecting ``data`` and ``rois`` to this op.

    Both arguments are converted with ``as_cuda_ndarray_variable`` first.
    After conversion ``data`` must be 4-D and ``rois`` must be 2-D; both
    conditions are enforced with ``assert``.

    Returns an ``Apply`` whose inputs are the converted ``[data, rois]`` and
    whose two outputs are fresh variables created from ``data``'s type.
    """
    gpu_data = as_cuda_ndarray_variable(data)
    gpu_rois = as_cuda_ndarray_variable(rois)

    assert gpu_data.ndim == 4
    assert gpu_rois.ndim == 2

    # NOTE(review): the meaning of the two outputs is not visible here
    # (presumably a pooled result plus bookkeeping for the gradient) --
    # confirm against the op's C/CUDA implementation.
    outputs = [gpu_data.type() for _ in range(2)]
    return Apply(self, [gpu_data, gpu_rois], outputs)
def local_gpu_advanced_incsubtensor1_scal_floats(node):
    """Replace ``AdvancedIncSubtensor1Floats`` by a GPU op where supported.

    Two graph shapes are handled:

    * ``GpuFromHost(AdvancedIncSubtensor1Floats(x, y, *coords))`` becomes
      ``gpu_op(gpu(x), gpu(y), *coords)``.
    * ``AdvancedIncSubtensor1Floats(x, y, idx)`` with three float32 inputs,
      where ``x`` or ``y`` is produced by ``HostFromGpu``, becomes
      ``host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))``.

    Only the ``(x.ndim, y.ndim)`` combinations listed in the dispatch table
    are rewritten.

    Returns a one-element list holding the replacement variable, or
    ``False`` when the node is left untouched.
    """
    # (x.ndim, y.ndim) -> op class used for that combination.
    gpu_op_by_dims = {
        (1, 0): GpuAdvancedIncSubtensor1Floats_scal_dev20,
        (2, 2): GpuAdvancedIncSubtensor1Floats_dev20,
    }

    def _build_gpu_op(dims, host_op):
        # Carry the host op's flags over to its replacement.
        set_instead_of_inc = host_op.set_instead_of_inc
        inplace = host_op.inplace
        return gpu_op_by_dims[dims](
            inplace=inplace, set_instead_of_inc=set_instead_of_inc)

    def _gpu_version(var):
        # Return (gpu_variable, came_from_gpu): reuse the input of an
        # existing HostFromGpu when there is one, otherwise transfer.
        if var.owner and isinstance(var.owner.op, HostFromGpu):
            gpu_var, = var.owner.inputs
            return gpu_var, True
        return as_cuda_ndarray_variable(var), False

    op = node.op

    if isinstance(op, GpuFromHost):
        host_input = node.inputs[0]
        producer = host_input.owner
        # Exact class match on purpose: this should not execute for
        # GpuAdvancedIncSubtensor1.
        if producer and producer.op.__class__ is AdvancedIncSubtensor1Floats:
            x, y = producer.inputs[0:2]
            dims = (x.ndim, y.ndim)
            if dims not in gpu_op_by_dims:
                return False
            coords = producer.inputs[2:]
            gpu_op = _build_gpu_op(dims, producer.op)
            return [
                gpu_op(as_cuda_ndarray_variable(x),
                       as_cuda_ndarray_variable(y), *coords)
            ]

    # Exact class match on purpose: this should not execute for
    # GpuAdvancedIncSubtensor1.
    if op.__class__ is not AdvancedIncSubtensor1Floats:
        return False
    if not (node.inputs[0].dtype == "float32" and
            node.inputs[1].dtype == "float32" and
            node.inputs[2].dtype == "float32"):
        return False

    x, y = node.inputs[0:2]
    dims = (x.ndim, y.ndim)
    if dims not in gpu_op_by_dims:
        return False
    coords = node.inputs[2:]

    gpu_x, x_from_gpu = _gpu_version(x)
    gpu_y, y_from_gpu = _gpu_version(y)

    # Only rewrite when at least one operand already lives on the GPU.
    if not (x_from_gpu or y_from_gpu):
        return False

    gpu_op = _build_gpu_op(dims, op)
    return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
def make_node(self, img, kern):
    """Build the Apply node for this op from ``img`` and ``kern``.

    Both arguments are converted with ``as_cuda_ndarray_variable``.

    Raises:
        TypeError: if the converted ``img`` or ``kern`` is not 4-D.

    Returns an ``Apply`` with inputs ``[img, kern]`` and one 4-D
    ``CudaNdarrayType`` output.
    """
    gpu_img = as_cuda_ndarray_variable(img)
    gpu_kern = as_cuda_ndarray_variable(kern)

    checks = (
        (gpu_img, 'img must be 4D tensor'),
        (gpu_kern, 'kern must be 4D tensor'),
    )
    for var, message in checks:
        if var.type.ndim != 4:
            raise TypeError(message)

    # The first two output dims inherit the broadcast flag of the leading
    # dim of img and kern respectively; the last two are never
    # broadcastable.
    out_bcast = [
        gpu_img.type.broadcastable[0],
        gpu_kern.type.broadcastable[0],
        False,
        False,
    ]
    out_var = CudaNdarrayType(out_bcast)()
    return Apply(self, [gpu_img, gpu_kern], [out_var])
def make_node(self, data, rois, argmaxes, out_grad):
    """Build the Apply node for this op from its four inputs.

    Every argument is converted with ``as_cuda_ndarray_variable``.  After
    conversion ``data``, ``argmaxes`` and ``out_grad`` must be 4-D and
    ``rois`` must be 2-D; the ranks are enforced with ``assert``.

    Returns an ``Apply`` with the four converted inputs and a single output
    created from ``data``'s type.
    """
    gpu_inputs = [
        as_cuda_ndarray_variable(var)
        for var in (data, rois, argmaxes, out_grad)
    ]

    # Expected rank of data, rois, argmaxes, out_grad, in that order.
    for var, rank in zip(gpu_inputs, (4, 2, 4, 4)):
        assert var.ndim == rank

    gpu_data = gpu_inputs[0]
    return Apply(self, gpu_inputs, [gpu_data.type()])
def make_node(self, bottom0, bottom1):
    """Build the Apply node for this op from ``bottom0`` and ``bottom1``.

    Both arguments are converted with ``as_cuda_ndarray_variable`` and must
    be 4-D afterwards (enforced with ``assert``).

    Returns an ``Apply`` with the two converted inputs and three outputs,
    each a fresh variable created from ``bottom0``'s type.
    """
    gpu_bottom0 = as_cuda_ndarray_variable(bottom0)
    gpu_bottom1 = as_cuda_ndarray_variable(bottom1)

    for var in (gpu_bottom0, gpu_bottom1):
        assert var.ndim == 4

    # NOTE(review): what each of the three outputs holds is not visible
    # from this method -- confirm against the op implementation.
    outputs = [gpu_bottom0.type() for _ in range(3)]
    return Apply(self, [gpu_bottom0, gpu_bottom1], outputs)
def make_node(self, bottom0, bottom1, rbot0, rbot1, out_grad):
    """Build the Apply node for this op from its five inputs.

    Every argument is converted with ``as_cuda_ndarray_variable`` and must
    be 4-D afterwards (enforced with ``assert``).

    Returns an ``Apply`` with the five converted inputs and two outputs,
    each a fresh variable created from ``bottom0``'s type.
    """
    gpu_inputs = [
        as_cuda_ndarray_variable(var)
        for var in (bottom0, bottom1, rbot0, rbot1, out_grad)
    ]

    for var in gpu_inputs:
        assert var.ndim == 4

    gpu_bottom0 = gpu_inputs[0]
    outputs = [gpu_bottom0.type() for _ in range(2)]
    return Apply(self, gpu_inputs, outputs)
def make_node(self, input):
    """Build the Apply node for this op from a single 3-D float32 input.

    ``input`` is converted with ``as_cuda_ndarray_variable`` and wrapped in
    ``gpu_contiguous``.  As a side effect ``self.destructive`` is set to
    ``True``.  The converted input must have dtype ``"float32"`` and be 3-D
    (laid out as ``(batch, a, b)``); both are enforced with ``assert``.

    Returns an ``Apply`` with the converted input and one output whose type
    comes from ``self.output_type``.
    """
    gpu_input = gpu_contiguous(as_cuda_ndarray_variable(input))

    # Set on the op instance every time a node is built.
    self.destructive = True

    assert gpu_input.dtype == "float32"
    assert gpu_input.ndim == 3  # (batch, a, b)

    result_type = self.output_type(gpu_input)
    return theano.Apply(self, [gpu_input], [result_type()])
def make_node(self, cond, ift, iff):
    """Build the Apply node selecting between ``ift`` and ``iff`` by ``cond``.

    Args:
        cond: condition variable; it is flattened, cast to float32 and
            moved to the GPU, and must be 1-D afterwards.
        ift: value variable; must have no broadcastable dimension.
        iff: value variable; must have no broadcastable dimension, and the
            same dtype and number of dimensions as ``ift``.

    Raises:
        ValueError: if ``ift`` or ``iff`` has any broadcastable dimension.
        AssertionError: if the dtypes or ranks of ``ift`` and ``iff``
            differ, or the converted ``cond`` is not 1-D.

    Returns an ``Apply`` with inputs ``[cond, ift, iff]`` (all converted with
    ``as_cuda_ndarray_variable``) and one ``CudaNdarrayType`` output with
    ``ift``'s broadcast pattern and original dtype.
    """
    if any(ift.broadcastable) or any(iff.broadcastable):
        # Both patterns must be wrapped in one tuple.  Without the
        # parentheses ``%`` binds only to ``ift.broadcastable`` and the
        # formatting itself fails (or yields a wrong message) instead of
        # raising the intended ValueError.
        raise ValueError(
            "GPURowSwitch cannot operate on broadcastable "
            "output arguments (ift %s, iff %s)."
            % (ift.broadcastable, iff.broadcastable))

    # Read the dtype before ``ift`` is rebound to its GPU counterpart.
    out_type = ift.dtype

    cond = as_cuda_ndarray_variable(T.cast(cond.flatten(), "float32"))
    ift = as_cuda_ndarray_variable(ift)
    iff = as_cuda_ndarray_variable(iff)

    assert ift.type.dtype == iff.type.dtype
    assert cond.ndim == 1, cond.ndim
    assert ift.ndim == iff.ndim

    return theano.gof.Apply(self, [cond, ift, iff], [
        CudaNdarrayType(broadcastable=ift.broadcastable, dtype=out_type)()
    ])
def make_node(self, cond, ift, iff):
    """Build the Apply node for this op from ``cond``, ``ift`` and ``iff``.

    Args:
        cond: condition variable; it is flattened, cast to float32 and
            moved to the GPU, and must be 1-D afterwards.
        ift: value variable; must have no broadcastable dimension.
        iff: value variable; must have no broadcastable dimension, and the
            same dtype and number of dimensions as ``ift``.

    Raises:
        ValueError: if ``ift`` or ``iff`` has any broadcastable dimension.
        AssertionError: if the dtypes or ranks of ``ift`` and ``iff``
            differ, or the converted ``cond`` is not 1-D.

    Returns an ``Apply`` with inputs ``[cond, ift, iff]`` (all converted with
    ``as_cuda_ndarray_variable``) and one ``CudaNdarrayType`` output whose
    broadcast pattern is ``ift``'s pattern without its first dimension.
    """
    if any(ift.broadcastable) or any(iff.broadcastable):
        # Both patterns must be wrapped in one tuple.  Without the
        # parentheses ``%`` binds only to ``ift.broadcastable`` and the
        # formatting itself fails (or yields a wrong message) instead of
        # raising the intended ValueError.
        raise ValueError(
            "GpuMaskedCAReduce cannot operate on "
            "broadcastable output arguments (ift %s, iff %s)."
            % (ift.broadcastable, iff.broadcastable))

    # Read the dtype before ``ift`` is rebound to its GPU counterpart.
    out_type = ift.dtype

    cond = as_cuda_ndarray_variable(T.cast(cond.flatten(), "float32"))
    ift = as_cuda_ndarray_variable(ift)
    iff = as_cuda_ndarray_variable(iff)
    # TODO check contiguous?

    assert ift.type.dtype == iff.type.dtype
    assert cond.ndim == 1, cond.ndim
    assert ift.ndim == iff.ndim

    # The output has one dimension fewer than the inputs: the leading
    # dimension of ``ift`` is dropped from the broadcast pattern.
    out_bcast = ift.broadcastable[1:]
    return theano.gof.Apply(
        self, [cond, ift, iff],
        [CudaNdarrayType(broadcastable=out_bcast, dtype=out_type)()])
def local_gpua_row_switch(node):
    """Detect eligible Switch instances and replace them with a GPU row switch.

    A node is eligible when all of the following hold:

    * its op is exactly ``T.Elemwise`` wrapping a ``theano.scalar.Switch``;
    * ``cond`` is non-broadcastable along its first dimension and
      broadcastable along every other one (a vector mask broadcast over
      the remaining dimensions);
    * ``ift`` is 2-D or 3-D;
    * ``ift`` and ``iff`` are both float32.

    Returns a one-element list holding
    ``HostFromGpu(GpuRowSwitch(cond, gpu_ift, gpu_iff))``, or ``False`` when
    the node is left untouched.
    """
    op = node.op
    # Reject everything that is not an elementwise Switch.  The previous
    # guard only rejected "Elemwise but not Switch", so any non-Elemwise
    # node fell through to the three-way unpacking below.
    if not (op.__class__ is T.Elemwise and
            op.scalar_op.__class__ is theano.scalar.Switch):
        return False

    cond, ift, iff = node.inputs

    # Only applies to Switch instances where a vector mask broadcasts over
    # matrices.
    bcast = cond.broadcastable
    if not bcast or not (not bcast[0] and all(bcast[1:]) and
                         ift.ndim in [2, 3]):
        return False

    if not (ift.dtype == iff.dtype == "float32"):
        return False

    def _gpu_version(var):
        # Reuse the input of an existing HostFromGpu when there is one,
        # otherwise transfer the variable.
        if var.owner and isinstance(var.owner.op, HostFromGpu):
            gpu_var, = var.owner.inputs
            return gpu_var
        return as_cuda_ndarray_variable(var)

    gpu_ift = _gpu_version(ift)
    gpu_iff = _gpu_version(iff)

    # ``cond`` is handed over unchanged, exactly as before; the GPU copy
    # that used to be built here was never used and has been dropped.
    # NOTE(review): this relies on GpuRowSwitch.make_node flattening,
    # casting and transferring ``cond`` itself -- confirm.
    gpu_op = GpuRowSwitch()
    return [HostFromGpu()(gpu_op(cond, gpu_ift, gpu_iff))]
def make_node(self, x, y, ilist):
    """Build the Apply node that sets/increments rows of ``x`` by ``y``.

    ``x`` and ``y`` are converted with ``as_cuda_ndarray_variable``.  The
    index ``ilist`` is cast to ``config.floatX`` and wrapped in
    ``gpu_contiguous``; because of that cast no integer-dtype check is
    performed on it.

    Raises:
        AssertionError: if ``x`` and ``y`` differ in dtype, or ``y`` has
            more dimensions than ``x``.
        TypeError: if the index is not 1-D, or ``x`` is a scalar.

    Returns an ``Apply`` with inputs ``[x, y, index]`` and one output
    created from ``x``'s type.
    """
    gpu_x = as_cuda_ndarray_variable(x)
    gpu_y = as_cuda_ndarray_variable(y)
    index = gpu_contiguous(T.cast(ilist, config.floatX))

    x_type = gpu_x.type
    y_type = gpu_y.type

    assert x_type.dtype == y_type.dtype
    assert x_type.ndim >= y_type.ndim

    if index.type.ndim != 1:
        raise TypeError('index must be vector')
    if x_type.ndim == 0:
        raise TypeError('cannot index into a scalar')

    # With assertions enabled this branch cannot trigger (the assert above
    # already rejects y.ndim > x.ndim); it only matters under ``python -O``.
    if y_type.ndim > x_type.ndim:
        opname = 'set' if self.set_instead_of_inc else 'increment'
        raise TypeError('cannot %s x subtensor with ndim=%s'
                        ' by y with ndim=%s' % (opname, x_type.ndim,
                                                y_type.ndim))

    return theano.gof.Apply(self, [gpu_x, gpu_y, index], [x_type()])
def make_node(self, x, ilist):
    """Build the Apply node that picks rows of ``x`` according to ``ilist``.

    ``x`` is converted with ``as_cuda_ndarray_variable``.  The index
    ``ilist`` is cast to ``config.floatX`` and wrapped in
    ``gpu_contiguous``; because of that cast no integer-dtype check is
    performed on it.

    Raises:
        TypeError: if the index is not 1-D, or ``x`` is a scalar.

    Returns an ``Apply`` with inputs ``[x, index]`` and one
    ``CudaNdarrayType`` output.
    """
    gpu_x = as_cuda_ndarray_variable(x)
    index = gpu_contiguous(T.cast(ilist, dtype=config.floatX))

    if index.type.ndim != 1:
        raise TypeError('index must be vector')
    if gpu_x.type.ndim == 0:
        raise TypeError('cannot index into a scalar')

    # The leading output dimension follows the index vector; the remaining
    # dimensions keep the broadcast flags of ``x``.
    leading = (index.broadcastable[0], )
    out_bcast = leading + gpu_x.broadcastable[1:]

    # The dtype is read from the caller-supplied ``x``, not from the
    # converted variable.
    out_var = CudaNdarrayType(dtype=x.dtype, broadcastable=out_bcast)()
    return theano.gof.Apply(self, [gpu_x, index], [out_var])
def local_gpu_join_unsafe(node):
    """Rewrite ``join(host_from_gpu(...))`` as ``host_from_gpu(gpu_join(...))``.

    Inspired by the optimization for convop; the notation below is loose.

    The subgraphs of interest initially look like::

        [array of HostTensor] -> HostToGpu -> GpuToHost
        -> Join
        -> HostToGpu -> GpuToHost

    This optimization first turns the join into
    ``host_from_gpu(gpu_join)``, which leaves, as an intermediate result::

        host_from_gpu(gpu_join) -> HostToGpu -> GpuToHost

    The redundant ``GpuToHost -> HostToGpu`` pair is expected to be removed
    by other optimizations, leaving just ``host_from_gpu(gpu_join)``.

    For places in the graph not covered by the rule above,
    ``gpu_from_host(join) -> gpu_join(gpu_from_host)`` could be useful; it
    is not implemented yet.

    The rewrite is applied only when every joined tensor is float32 and is
    either produced by ``HostFromGpu`` or a constant.  Returns a
    one-element list with the replacement, or ``None`` otherwise.
    """
    if not isinstance(node.op, JoinUnsafe):
        return None

    def _is_movable(tensor):
        # float32 and either already coming from the GPU or a constant.
        if tensor.dtype != 'float32':
            return False
        from_gpu = (tensor.owner is not None and
                    isinstance(tensor.owner.op, HostFromGpu))
        return from_gpu or isinstance(tensor, theano.gof.Constant)

    inputs = node.inputs
    tensors = inputs[1:]

    if not all([_is_movable(tensor) for tensor in tensors]):
        return None

    # The axis (first input) is passed through untouched; only the joined
    # tensors are moved to the GPU.
    gpu_args = [inputs[0]]
    gpu_args.extend([as_cuda_ndarray_variable(tensor) for tensor in tensors])

    return [host_from_gpu(GpuJoinUnsafe()(*gpu_args))]
def local_gpu_advanced_subtensor1_floats(node):
    """Replace ``AdvancedSubtensor1Floats`` by ``GpuAdvancedSubtensor1Floats``.

    Two graph shapes are handled:

    * ``GpuFromHost(AdvancedSubtensor1Floats(x, *coords))`` becomes
      ``GpuAdvancedSubtensor1Floats(tag)(gpu(x), *coords)``.
    * ``AdvancedSubtensor1Floats(host_from_gpu(gpu_x), *coords)`` with a
      float32 ``x`` becomes
      ``host_from_gpu(GpuAdvancedSubtensor1Floats(tag)(gpu_x, *coords))``.

    In both cases ``tag`` is the ``_tag`` attribute of the op being
    replaced.

    Returns a one-element list holding the replacement variable, or
    ``False`` when the node is left untouched.
    """
    op = node.op

    if isinstance(op, GpuFromHost):
        producer = node.inputs[0].owner
        # Exact class match, so only the host op itself is rewritten.
        if producer and producer.op.__class__ is AdvancedSubtensor1Floats:
            source = producer.inputs[0]
            coords = producer.inputs[1:]
            gpu_op = GpuAdvancedSubtensor1Floats(producer.op._tag)
            return [gpu_op(as_cuda_ndarray_variable(source), *coords)]

    if op.__class__ is not AdvancedSubtensor1Floats:
        return False

    source = node.inputs[0]
    coords = node.inputs[1:]

    # Only rewrite when the indexed tensor already comes from the GPU and
    # is float32.
    if not (source.owner and isinstance(source.owner.op, HostFromGpu) and
            source.dtype == "float32"):
        return False

    gpu_source, = source.owner.inputs
    gpu_op = GpuAdvancedSubtensor1Floats(op._tag)
    return [host_from_gpu(gpu_op(gpu_source, *coords))]