def make_node(self, x, y, *inputs):
    """
    Build the Apply node for this op with `x` and `y` as GPU variables.

    `x` and `y` are converted with `as_gpuarray_variable` in the context
    inferred from the two of them.  The remaining inputs are taken from
    the node built by `IncSubtensor.make_node` (every input after the
    first two).  The single output has the type of the converted `x`.
    """
    target_ctx = infer_context_name(x, y)
    gpu_x = as_gpuarray_variable(x, target_ctx)
    gpu_y = as_gpuarray_variable(y, target_ctx)
    # Delegate the handling of the extra arguments to IncSubtensor.
    base_node = IncSubtensor.make_node(self, gpu_x, gpu_y, *inputs)
    node_inputs = [gpu_x, gpu_y] + base_node.inputs[2:]
    return gof.Apply(self, node_inputs, [gpu_x.type()])
def make_node(self, inp1, inp2):
    """
    Build the Apply node for two float32 matrices living on the GPU.

    Both inputs are converted to GPU variables in a common context and
    made contiguous.  The output is a float32 GPU variable with the
    broadcastable pattern of the first input.

    Raises RuntimeError when CUSOLVER is not available.
    """
    if not cusolver_available:
        raise RuntimeError(
            'CUSOLVER is not available and '
            'GpuCusolverSolve Op can not be constructed.'
        )
    if skcuda.__version__ <= '0.5.1':
        warnings.warn(
            'The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8'
        )

    ctx = basic_ops.infer_context_name(inp1, inp2)
    first, second = [
        basic_ops.as_gpuarray_variable(v, ctx) for v in (inp1, inp2)
    ]
    first, second = [basic_ops.gpu_contiguous(v) for v in (first, second)]

    # this op can only operate on float32 matrices
    for operand in (first, second):
        assert operand.ndim == 2
    for operand in (first, second):
        assert operand.dtype == 'float32'

    out_type = GpuArrayType(
        'float32', broadcastable=first.broadcastable, context_name=ctx
    )
    return theano.Apply(self, [first, second], [out_type()])
def make_node(self, inp1, inp2):
    """
    Build the Apply node from two contiguous GPU inputs of equal dtype.

    The first input must have 2 dimensions and the second 1 or 2.  The
    output uses the dtype of the first input and the broadcastable
    pattern of the second one.

    Raises RuntimeError when CUBLAS is not available.
    """
    if not cublas_available:
        raise RuntimeError(
            "CUBLAS is not available and "
            "GpuCublasTriangularSolve Op "
            "can not be constructed."
        )

    ctx = infer_context_name(inp1, inp2)
    first = as_gpuarray_variable(inp1, ctx)
    second = as_gpuarray_variable(inp2, ctx)
    first = gpu_contiguous(first)
    second = gpu_contiguous(second)

    assert first.ndim == 2
    assert second.ndim in [1, 2]
    assert first.dtype == second.dtype

    out_type = GpuArrayType(
        first.dtype, broadcastable=second.broadcastable, context_name=ctx
    )
    return theano.Apply(self, [first, second], [out_type()])
def make_node(self, inp1, inp2):
    """
    Build the Apply node from two contiguous GPU matrices of equal dtype.

    The output uses the dtype and the broadcastable pattern of the
    first input.

    Raises RuntimeError when CUSOLVER is not available.
    """
    if not cusolver_available:
        raise RuntimeError(
            "CUSOLVER is not available and "
            "GpuCusolverSolve Op can not be constructed."
        )
    if skcuda.__version__ <= "0.5.1":
        warnings.warn(
            "The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8"
        )

    ctx = infer_context_name(inp1, inp2)
    first, second = [as_gpuarray_variable(v, ctx) for v in (inp1, inp2)]
    first, second = [gpu_contiguous(v) for v in (first, second)]

    for operand in (first, second):
        assert operand.ndim == 2
    assert first.dtype == second.dtype

    out_type = GpuArrayType(
        first.dtype, broadcastable=first.broadcastable, context_name=ctx
    )
    return theano.Apply(self, [first, second], [out_type()])
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """
    Replace an average-pooling gradient by `GpuDnnPoolGrad`.

    Returns None (no replacement) when cuDNN is not available for
    `ctx_name`, when `op.ignore_border` is false, or when `op.ndim` is
    neither 2 nor 3.
    """
    if not dnn_available(ctx_name) or not op.ignore_border:
        return None

    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return None

    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    pool_mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        padded_inp = pad_dims(inp, 2, nd)
        padded_grad = pad_dims(out_grad, 2, nd)
        padded_result = GpuDnnPoolGrad(mode=pool_mode)(
            padded_inp, padded_grad, padded_grad, ws, stride, pad
        )
        return unpad_dims(padded_result, inp, 2, nd)

    # We reuse out_grad because cuDNN does not use the value of the `out`
    # argument but still checks its shape for average pooling.  This
    # has been observed in v2 and v3 as far as I know.
    return GpuDnnPoolGrad(mode=pool_mode)(
        inp, out_grad, out_grad, ws, stride, pad
    )
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    """
    Replace a pooling gradient by `GpuDnnPoolGrad`.

    Returns None (no replacement) when cuDNN is not available for
    `ctx_name`, when `op.ignore_border` is false, or when `op.ndim` is
    neither 2 nor 3.
    """
    if not dnn_available(ctx_name) or not op.ignore_border:
        return None

    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return None

    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    pool_mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        padded_inp = pad_dims(inp, 2, nd)
        padded_out = pad_dims(out, 2, nd)
        padded_grad = pad_dims(out_grad, 2, nd)
        padded_result = GpuDnnPoolGrad(mode=pool_mode)(
            padded_inp, padded_out, padded_grad, ws, stride, pad
        )
        return unpad_dims(padded_result, inp, 2, nd)

    return GpuDnnPoolGrad(mode=pool_mode)(inp, out, out_grad, ws, stride, pad)
def make_node(self, x, y, ilist):
    """
    It differs from GpuAdvancedIncSubtensor1 in that it makes sure
    the indexes are of type long.

    `x`, `y` (cast to the dtype of `x`) and `ilist` are all converted to
    GPU variables in the context inferred from the three of them.  The
    single output has the type of the converted `x`.

    Raises
    ------
    TypeError
        If the index is not an integer vector, if `x` is a scalar, or
        if `y` has more dimensions than `x`.

    """
    ctx_name = infer_context_name(x, y, ilist)
    x_ = as_gpuarray_variable(x, ctx_name)
    y_ = as_gpuarray_variable(y.astype(x.dtype), ctx_name)
    ilist_ = as_gpuarray_variable(ilist, ctx_name)

    if ilist_.type.dtype not in tt.integer_dtypes:
        raise TypeError("index must be integers")
    if ilist_.type.ndim != 1:
        raise TypeError("index must be vector")
    if x_.type.ndim == 0:
        raise TypeError("cannot index into a scalar")
    # The relation between the ranks of x and y is checked with an
    # explicit TypeError rather than an assert: the check then survives
    # `python -O` and the message says which operation was rejected.
    if y_.type.ndim > x_.type.ndim:
        opname = "set" if self.set_instead_of_inc else "increment"
        raise TypeError(
            "cannot %s x subtensor with ndim=%s by y with ndim=%s "
            % (opname, x_.type.ndim, y_.type.ndim)
        )

    return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def make_node(self, x, y, ilist):
    """
    Build the Apply node with `x` and `y` on the GPU and a tensor index.

    `x` and `y` are converted to GPU variables in the context inferred
    from the two of them; `ilist` is converted with
    `tt.as_tensor_variable`.  The single output has the type of the
    converted `x`.

    Raises
    ------
    TypeError
        If the index is not an integer vector, if `x` is a scalar, or
        if `y` has more dimensions than `x`.

    """
    ctx_name = infer_context_name(x, y)
    x_ = as_gpuarray_variable(x, ctx_name)
    y_ = as_gpuarray_variable(y, ctx_name)
    ilist_ = tt.as_tensor_variable(ilist)

    if ilist_.type.dtype not in tt.integer_dtypes:
        raise TypeError("index must be integers")
    if ilist_.type.ndim != 1:
        raise TypeError("index must be vector")
    if x_.type.ndim == 0:
        raise TypeError("cannot index into a scalar")
    # The relation between the ranks of x and y is checked with an
    # explicit TypeError rather than an assert: the check then survives
    # `python -O` and the message says which operation was rejected.
    if y_.type.ndim > x_.type.ndim:
        opname = "set" if self.set_instead_of_inc else "increment"
        raise TypeError(
            "cannot %s x subtensor with ndim=%s by y with ndim=%s "
            % (opname, x_.type.ndim, y_.type.ndim)
        )

    return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def make_node(self, x, ilist):
    """
    Build the Apply node with `x` and an int64 index vector on the GPU.

    The index is first turned into a tensor variable, cast to int64 if
    it has another integer dtype, and then moved to the GPU and made
    contiguous.  The output broadcastable pattern is the pattern of the
    index followed by the pattern of `x` without its first dimension.

    Raises TypeError for a non-integer or non-vector index, and when
    `x` is a scalar.
    """
    ctx_name = infer_context_name(x, ilist)
    gpu_x = as_gpuarray_variable(x, ctx_name)

    cpu_index = tt.as_tensor_variable(ilist)
    index_dtype = cpu_index.type.dtype
    if index_dtype not in tt.integer_dtypes:
        raise TypeError("index must be integers")
    if index_dtype != "int64":
        cpu_index = tt.cast(cpu_index, "int64")

    gpu_index = as_gpuarray_variable(cpu_index, ctx_name)
    gpu_index = gpu_contiguous(gpu_index)
    if gpu_index.type.dtype != "int64":
        raise TypeError("index must be int64")
    if gpu_index.type.ndim != 1:
        raise TypeError("index must be a vector")
    if gpu_x.type.ndim == 0:
        raise TypeError("cannot index into a scalar")

    out_bcast = gpu_index.broadcastable + gpu_x.broadcastable[1:]
    out_type = GpuArrayType(
        dtype=x.dtype, context_name=ctx_name, broadcastable=out_bcast
    )
    return gof.Apply(self, [gpu_x, gpu_index], [out_type()])
def make_node(self, inp1, inp2):
    """
    Build the Apply node for two float32 matrices living on the GPU.

    Both inputs are moved to a common GPU context and made contiguous.
    The output is a float32 GPU variable whose broadcastable pattern is
    copied from the first input.

    Raises RuntimeError when CUSOLVER is not available.
    """
    if not cusolver_available:
        raise RuntimeError(
            'CUSOLVER is not available and '
            'GpuCusolverSolve Op can not be constructed.'
        )
    if skcuda.__version__ <= '0.5.1':
        warnings.warn(
            'The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')

    ctx = basic_ops.infer_context_name(inp1, inp2)
    gpu_a = basic_ops.as_gpuarray_variable(inp1, ctx)
    gpu_b = basic_ops.as_gpuarray_variable(inp2, ctx)
    gpu_a = basic_ops.gpu_contiguous(gpu_a)
    gpu_b = basic_ops.gpu_contiguous(gpu_b)
    operands = [gpu_a, gpu_b]

    # this op can only operate on float32 matrices
    for operand in operands:
        assert operand.ndim == 2
    for operand in operands:
        assert operand.dtype == 'float32'

    result_type = GpuArrayType(
        'float32', broadcastable=gpu_a.broadcastable, context_name=ctx)
    return theano.Apply(self, operands, [result_type()])
def make_node(self, d, x):
    """
    Build the Apply node from two 1-d GPU inputs.

    Both inputs are converted in `self.context_name`.  The output is a
    non-broadcastable GPU vector whose dtype is int64 when
    `self.dtype_int64` is true and int32 otherwise.
    """
    ctx = self.context_name
    gpu_d = as_gpuarray_variable(d, context_name=ctx)
    gpu_x = as_gpuarray_variable(x, context_name=self.context_name)
    for vec in (gpu_d, gpu_x):
        assert vec.ndim == 1

    if self.dtype_int64:
        out_dtype = 'int64'
    else:
        out_dtype = 'int32'
    out_type = GpuArrayType(dtype=out_dtype,
                            broadcastable=(False,),
                            context_name=self.context_name)
    return gof.Apply(self, [gpu_d, gpu_x], [out_type()])
def make_node(self, x, y, *inputs):
    """
    Build the Apply node with `x` and `y` as GPU variables.

    `AdvancedIncSubtensor.make_node` is called on the unconverted
    arguments; its first output provides the dtype and broadcastable
    pattern of the GPU output type, and its inputs after the first two
    are reused as-is.
    """
    ctx_name = infer_context_name(x, y)
    base_node = AdvancedIncSubtensor.make_node(self, x, y, *inputs)
    base_out_type = base_node.outputs[0].type
    out_type = GpuArrayType(
        dtype=base_out_type.dtype,
        broadcastable=base_out_type.broadcastable,
        context_name=ctx_name,
    )
    gpu_x = as_gpuarray_variable(x, ctx_name)
    gpu_y = as_gpuarray_variable(y, ctx_name)
    node_inputs = [gpu_x, gpu_y] + base_node.inputs[2:]
    return gof.Apply(self, node_inputs, [out_type()])
def make_node(self, x, ilist):
    """
    Build the Apply node with both `x` and the index on the GPU.

    The single output has the type of the converted `x`.

    Raises TypeError when the index is not an integer vector or when
    `x` is a scalar.
    """
    ctx_name = infer_context_name(x, ilist)
    gpu_x = as_gpuarray_variable(x, ctx_name)
    gpu_index = as_gpuarray_variable(ilist, ctx_name)

    index_type = gpu_index.type
    if index_type.dtype not in tensor.integer_dtypes:
        raise TypeError('index must be integers')
    if index_type.ndim != 1:
        raise TypeError('index must be vector')
    if gpu_x.type.ndim == 0:
        raise TypeError('cannot index into a scalar')

    return gof.Apply(self, [gpu_x, gpu_index], [gpu_x.type()])
def make_node(self, inp1, inp2):
    """
    Build the Apply node for two float32 GPU matrices.

    The output is a GPU matrix with the dtype of the first input and no
    broadcastable dimension.
    """
    context = infer_context_name(inp1, inp2)
    first = gpu_ops.as_gpuarray_variable(inp1, context)
    second = gpu_ops.as_gpuarray_variable(inp2, context)

    for operand in (first, second):
        assert operand.dtype == "float32"
    for operand in (first, second):
        assert operand.ndim == 2

    out_type = GpuArrayType(
        dtype=first.dtype,
        broadcastable=(False, False),
        context_name=context,
    )
    return Apply(self, [first, second], [out_type()])
def make_node(self, inp, s=None):
    """
    Build the Apply node from a contiguous float32 GPU input and a shape.

    When `s` is None the shape is derived from `inp.shape[1:-1]`, with
    its last entry replaced by `(last - 1) * 2`.

    Raises RuntimeError when skcuda, pygpu or pycuda is missing.
    """
    # A shape parameter is expected as an input. For now this is used to
    # manage odd transform sizes.
    # Later this could be extended to handle padding and truncation,
    # following numpy's interface. However, cuFFT expects arrays that match
    # the shape given to the plan, so padding will have to be done in the op.
    # The effect of padding on gradients has yet to be investigated.
    if not skcuda_available:
        raise RuntimeError("skcuda is needed for CuIFFTOp")
    if not pygpu_available:
        raise RuntimeError("pygpu is needed for CuIFFTOp")
    if not pycuda_available:
        raise RuntimeError("pycuda is needed for CuIFFTOp")

    gpu_inp = as_gpuarray_variable(inp, infer_context_name(inp))
    gpu_inp = gpu_contiguous(gpu_inp)

    # If no shape is provided as input, calculate shape assuming an even
    # real transform.
    if s is None:
        inner_shape = gpu_inp.shape[1:-1]
        last = inner_shape[-1]
        s = tt.set_subtensor(last, (inner_shape[-1] - 1) * 2)
    shape_var = tt.as_tensor_variable(s)

    assert gpu_inp.dtype == "float32"
    assert shape_var.ndim == 1

    out_var = self.output_type(gpu_inp)()
    return theano.Apply(self, [gpu_inp, shape_var], [out_var])
def local_cudnn_maxandargmax(node):
    """
    Replace a `GpuMaxAndArgmax` node by a `GpuDnnReduction("maximum")`.

    Returns None (no replacement) unless the node is a `GpuMaxAndArgmax`,
    cuDNN is available with a version of at least 6000, the input has at
    most 8 dimensions, input and output dtypes match and are floating
    point, and the reduction axes (when given) are sorted.
    """
    if not isinstance(node.op, GpuMaxAndArgmax):
        return None

    data = node.inputs[0]
    if not dnn_available(data.type.context_name):
        return None
    if version(raises=False) < 6000:
        return None
    if data.ndim > 8:
        return None
    if data.dtype != node.outputs[0].dtype:
        return None
    if data.dtype not in ["float16", "float32", "float64"]:
        return None

    # order of the axes influences the output indices
    axes = node.op.axis
    if axes is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return None

    reduction = GpuDnnReduction(
        "maximum",
        node.op.axis,
        node.outputs[0].dtype,
        node.outputs[0].dtype,
        True,
    )
    max_out, arg_out = reduction(node.inputs[0])

    # cudnn can only return int32 indices
    arg_int64 = as_gpuarray_variable(
        arg_out.astype("int64"), node.outputs[1].type.context_name
    )
    return (max_out, arg_int64)
def make_node(self, inp):
    """
    Build the Apply node for a contiguous float32 GPU matrix.

    The node has two outputs: one with the type of the (converted)
    input, which holds LU in a single matrix, and an int32 pivots array.

    Raises
    ------
    RuntimeError
        If CUSOLVER or pygpu is not available.

    """
    if not cusolver_available:
        raise RuntimeError('CUSOLVER is not available and '
                           'GpuLU Op can not be constructed.')
    # NOTE: this is a plain string comparison, hence lexicographic.
    if skcuda.__version__ <= '0.5.1':
        warnings.warn(
            'The GpuLU op requires scikit-cuda > 0.5.1 to work with CUDA 8')
    if not pygpu_available:
        # The literals are concatenated: the first one needs a trailing
        # space so that the two sentences do not run together.
        raise RuntimeError('Missing pygpu or triu/tril functions. '
                           'Install or update libgpuarray.')

    context_name = infer_context_name(inp)
    inp = as_gpuarray_variable(inp, context_name)
    inp = gpu_contiguous(inp)

    # this op can only operate on float32 matrices
    # because of current implementation of triu/tril.
    # TODO: support float64
    assert inp.ndim == 2
    assert inp.dtype == 'float32'

    # outputs LU in a single matrix, and a pivots array
    pivots_type = GpuArrayType('int32',
                               broadcastable=inp[0].broadcastable,
                               context_name=context_name)()
    return theano.Apply(self, [inp], [inp.type(), pivots_type])
def make_node(self, ten4, neib_shape, neib_step=None):
    """
    Build the Apply node from a 4-d GPU input and two integer vectors.

    `neib_step` defaults to the (converted) `neib_shape` when it is not
    given.  The output is a GPU matrix with the dtype and context of
    `ten4` and no broadcastable dimension.
    """
    gpu_ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
    shape_var = tt.as_tensor_variable(neib_shape)
    step_var = (
        shape_var if neib_step is None else tt.as_tensor_variable(neib_step)
    )

    assert gpu_ten4.ndim == 4
    assert shape_var.ndim == 1
    assert step_var.ndim == 1
    assert shape_var.dtype in tt.integer_dtypes
    assert step_var.dtype in tt.integer_dtypes

    node_inputs = [gpu_ten4, shape_var, step_var]
    out_type = GpuArrayType(
        broadcastable=(False, False),
        dtype=gpu_ten4.type.dtype,
        context_name=gpu_ten4.type.context_name,
    )
    return Apply(self, node_inputs, [out_type()])
def make_node(self, inp, s=None):
    """
    Build the Apply node from a contiguous float32 GPU input and a shape.

    When `s` is None it defaults to `inp.shape[1:]`.  The shape must be
    an integer vector.

    Raises RuntimeError when skcuda, pygpu or pycuda is missing.
    """
    # A shape parameter s can be provided as an input. For now this is
    # used to manage odd transform sizes.
    # Later this could be extended to handle padding and truncation,
    # following numpy's interface. However, cuFFT expects arrays that match
    # the shape given to the plan, so padding will have to be done in the op.
    # The effect of padding on gradients has yet to be investigated.
    if not scikits_cuda_available:
        raise RuntimeError("skcuda is needed for CuFFTOp")
    if not pygpu_available:
        raise RuntimeError("pygpu is needed for CuFFTOp")
    if not pycuda_available:
        raise RuntimeError("pycuda is needed for CuFFTOp")

    ctx = basic_ops.infer_context_name(inp)
    gpu_inp = basic_ops.as_gpuarray_variable(inp, ctx)
    gpu_inp = basic_ops.gpu_contiguous(gpu_inp)

    # If no shape is provided as input, default to input data shape.
    shape_var = T.as_tensor_variable(gpu_inp.shape[1:] if s is None else s)

    assert gpu_inp.dtype == "float32"
    assert shape_var.ndim == 1
    assert 'int' in shape_var.dtype

    out_var = self.output_type(gpu_inp)()
    return theano.Apply(self, [gpu_inp, shape_var], [out_var])
def make_node(self, A):
    """
    Build the Apply node for a contiguous float32 GPU matrix.

    The first output is a non-broadcastable GPU vector (S).  When
    `self.compute_uv` is true two more outputs with the type of `A`
    follow (U and VT).

    Raises LinAlgError when `A` is not 2-d and TypeError when it is not
    float32.
    """
    ctx_name = infer_context_name(A)
    A = gpu_contiguous(as_gpuarray_variable(A, ctx_name))

    if A.ndim != 2:
        raise LinAlgError("Matrix rank error")
    if A.dtype != "float32":
        raise TypeError("only `float32` is supported for now")

    s_type = GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)
    if not self.compute_uv:
        # return only S
        return theano.Apply(self, [A], [s_type()])
    # return S, U, VT
    return theano.Apply(self, [A], [s_type(), A.type(), A.type()])
def make_node(self, x, k=0):
    """
    Build the Apply node from a 2-d GPU input and a scalar `k`.

    The output has the dtype of `x`; it has two dimensions (the second
    one broadcastable) when `self.keepdims` is true and one otherwise.
    """
    # TODO: dtype check
    gpu_x = as_gpuarray_variable(x, context_name=self.context_name)
    k_var = tensor.as_tensor_variable(k)
    assert gpu_x.ndim == 2
    assert k_var.ndim == 0

    if self.keepdims:
        out_bcast = (False, True)
    else:
        out_bcast = (False,)
    out_type = GpuArrayType(dtype=gpu_x.type.dtype,
                            broadcastable=out_bcast,
                            context_name=self.context_name)
    return gof.Apply(self, [gpu_x, k_var], [out_type()])
def make_node(self, A):
    """
    Build the Apply node for a contiguous float32 GPU matrix.

    The single output has the same type as the converted input.

    Raises LinAlgError when `A` is not 2-d and TypeError when it is not
    float32.
    """
    gpu_a = as_gpuarray_variable(A, infer_context_name(A))
    gpu_a = gpu_contiguous(gpu_a)

    if gpu_a.ndim != 2:
        raise LinAlgError("Matrix rank error")
    if gpu_a.dtype != "float32":
        raise TypeError("only `float32` is supported for now")

    return theano.Apply(self, [gpu_a], [gpu_a.type()])
def local_softmax_dnn(node):
    """
    Replace a `GpuSoftmax` node by `GpuDnnSoftmax("accurate", "channel")`.

    The input gets two trailing broadcastable dimensions before the
    cuDNN op is applied and they are dropped again afterwards.  Returns
    None when the node is not a `GpuSoftmax` or cuDNN is unavailable.
    """
    if not isinstance(node.op, GpuSoftmax):
        return None
    if not dnn_available(node.outputs[0].type.context_name):
        return None

    padded = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    dnn_out = GpuDnnSoftmax("accurate", "channel")(padded)
    trimmed = dnn_out.dimshuffle(0, 1)
    return [as_gpuarray_variable(trimmed, dnn_out.type.context_name)]
def make_node(self, inp1, inp2):
    """
    Build the Apply node for two float32 matrices living on the GPU.

    The inferred context name is stored on the op as `self.context`.
    The output is a float32 GPU variable with the broadcastable pattern
    of the first input.
    """
    self.context = basic_ops.infer_context_name(inp1, inp2)

    first = basic_ops.as_gpuarray_variable(inp1, self.context)
    second = basic_ops.as_gpuarray_variable(inp2, self.context)
    first, second = [basic_ops.gpu_contiguous(v) for v in (first, second)]

    # this op can only operate on float32 matrices
    for operand in (first, second):
        assert operand.ndim == 2
    for operand in (first, second):
        assert operand.dtype == 'float32'

    out_type = GpuArrayType('float32',
                            broadcastable=first.broadcastable,
                            context_name=self.context)
    return theano.Apply(self, [first, second], [out_type()])
def make_node(self, inp1, inp2):
    """
    Build the Apply node for two float32 matrices living on the GPU.

    As a side effect the inferred context name is kept in
    `self.context`.  The output is float32 and reuses the broadcastable
    pattern of the first input.
    """
    self.context = basic_ops.infer_context_name(inp1, inp2)

    gpu_a = basic_ops.as_gpuarray_variable(inp1, self.context)
    gpu_b = basic_ops.as_gpuarray_variable(inp2, self.context)
    gpu_a = basic_ops.gpu_contiguous(gpu_a)
    gpu_b = basic_ops.gpu_contiguous(gpu_b)
    operands = [gpu_a, gpu_b]

    # this op can only operate on float32 matrices
    for operand in operands:
        assert operand.ndim == 2
    for operand in operands:
        assert operand.dtype == 'float32'

    result_type = GpuArrayType(
        'float32',
        broadcastable=gpu_a.broadcastable,
        context_name=self.context,
    )
    return theano.Apply(self, operands, [result_type()])
def make_node(self, x, k=0, n=0, m=0):
    """
    Build the Apply node from a 1-d or 2-d GPU input and three scalars.

    `k`, `n` and `m` are converted to tensor variables and must be
    0-dimensional.  The output is a GPU matrix with the dtype of `x`
    and no broadcastable dimension.
    """
    # TODO: dtype check
    gpu_x = as_gpuarray_variable(x, context_name=self.context_name)
    k_var, n_var, m_var = [
        tensor.as_tensor_variable(value) for value in (k, n, m)
    ]

    assert gpu_x.ndim in (2, 1)
    for scalar in (k_var, n_var, m_var):
        assert scalar.ndim == 0

    out_type = GpuArrayType(dtype=gpu_x.type.dtype,
                            broadcastable=(False, False),
                            context_name=self.context_name)
    return gof.Apply(self, [gpu_x, k_var, n_var, m_var], [out_type()])
def make_node(self, x):
    """
    Build the Apply node and the pygpu elementwise kernel for this op.

    A `pygpu.elemwise.GpuElemwise` evaluating ``c = a * x + b`` (with
    `self.a` and `self.b` inserted as text, on float32 arguments) is
    created and stored in `self.my_op`.  The output has the type of the
    converted input.
    """
    gpu_x = as_gpuarray_variable(x, self.context_name)

    kernel_args = [
        pygpu.elemwise.arg('x', 'float32', read=True),
        pygpu.elemwise.arg('c', 'float32', read=True, write=True),
    ]
    context = get_context(self.context_name)
    expression = "".join(["c = ", str(self.a), " * x + ", str(self.b)])
    self.my_op = pygpu.elemwise.GpuElemwise(
        context, expression, kernel_args, convert_f16=True
    )

    return Apply(self, [gpu_x], [gpu_x.type()])
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """
    Replace a softmax gradient by `GpuDnnSoftmaxGrad("accurate", "instance")`.

    Every input is moved to the GPU and must be 2-d; each one is
    expanded with broadcastable dimensions at positions 1 and 3 before
    the cuDNN op is applied.  Returns None when cuDNN is not available
    for `ctx_name` or when an input is not 2-d.
    """
    if not dnn_available(ctx_name):
        return None

    expanded = []
    for raw in inputs:
        gpu_var = as_gpuarray_variable(raw, ctx_name)
        if gpu_var.ndim != 2:
            return None
        expanded.append(gpu_var.dimshuffle(0, "x", 1, "x"))

    grad_op = GpuDnnSoftmaxGrad("accurate", "instance")
    first = gpu_contiguous(expanded[0])
    second = gpu_contiguous(expanded[1])
    result = grad_op(first, second)
    return [result.dimshuffle(0, 2)]
def make_node(self, _x):
    """
    Build the Apply node for a GPU input with at least 2 dimensions.

    The output type is the input type with the two dimensions
    `self.axis1` and `self.axis2` removed from the broadcastable pattern
    and one non-broadcastable dimension appended at the end.

    Raises ValueError when the input has fewer than 2 dimensions.
    """
    x = as_gpuarray_variable(_x, infer_context_name(_x))
    if x.ndim < 2:
        raise ValueError("Diagonal needs an input with 2 or more "
                         "dimensions", x)

    low_axis, high_axis = sorted((self.axis1, self.axis2))
    pattern = x.broadcastable
    before = pattern[:low_axis]
    between = pattern[low_axis + 1:high_axis]
    after = pattern[high_axis + 1:]
    out_bcast = before + between + after + (False,)

    out_type = x.type.clone(broadcastable=out_bcast)
    return gof.Apply(self, [x], [out_type()])
def make_node(self, activations, labels, input_lengths):
    """
    Build the Apply node from GPU activations and CPU labels / lengths.

    The first output is a float32 GPU vector of costs.  When
    `self.compute_grad` is true a second float32 3-d GPU output holds
    the gradients.

    Raises TypeError for a wrong dtype (float32 activations, int32
    labels and input lengths) and ValueError for a wrong number of
    dimensions (3, 2 and 1 respectively).
    """
    context_name = infer_context_name(activations)
    # Ensure activations array is C-contiguous
    t_activations = gpu_contiguous(
        as_gpuarray_variable(activations, context_name=context_name)
    )
    # Labels and input lengths are always on the CPU
    t_labels = tt.as_tensor_variable(labels)
    t_input_lengths = tt.as_tensor_variable(input_lengths)

    # (variable, required dtype, dtype error, required ndim, ndim error)
    requirements = (
        (t_activations, "float32", "activations must use the float32 type.",
         3, "activations must have 3 dimensions."),
        (t_labels, "int32", "labels must use the int32 type.",
         2, "labels must have 2 dimensions."),
        (t_input_lengths, "int32", "input_lengths must use the int32 type.",
         1, "input_lengths must have 1 dimension."),
    )
    for var, dtype, dtype_msg, ndim, ndim_msg in requirements:
        if var.type.dtype != dtype:
            raise TypeError(dtype_msg)
        if var.ndim != ndim:
            raise ValueError(ndim_msg)

    outputs = [
        GpuArrayType(
            dtype="float32", broadcastable=(False,), context_name=context_name
        )()
    ]
    if self.compute_grad:
        grad_type = GpuArrayType(
            dtype="float32",
            broadcastable=(False, False, False),
            context_name=context_name,
        )
        outputs += [grad_type()]

    return theano.Apply(
        self,
        inputs=[t_activations, t_labels, t_input_lengths],
        outputs=outputs,
    )
def make_node(self, diag):
    """
    Build the Apply node for a GPU input with at least 1 dimension.

    The output has the dtype of the input, one more dimension than the
    input (none of them broadcastable) and lives in the same context as
    the input.

    Raises
    ------
    ValueError
        If the input has no dimension.

    """
    ctx_name = infer_context_name(diag)
    diag = as_gpuarray_variable(diag, ctx_name)
    if diag.type.ndim < 1:
        raise ValueError(
            "AllocDiag needs an input with 1 or more " "dimensions", diag.type
        )

    out_type = diag.type.__class__(
        dtype=diag.dtype,
        broadcastable=[False] * (diag.ndim + 1),
        # Pass the context explicitly: without it the output type would
        # not carry the context that was inferred for the input.
        # NOTE(review): assumes diag.type is a GpuArrayType (which accepts
        # `context_name`) after as_gpuarray_variable -- confirm.
        context_name=ctx_name,
    )
    return gof.Apply(self, [diag], [out_type()])
def make_node(self, src, dest=None):
    """
    Build the Apply node and configure the in-place state of the op.

    Without `dest` the op works on `src` alone and destroys input 0
    only when `self.inplace` is set.  With `dest` the op is forced to be
    in-place on input 1.  `self.inplace_pattern` and `self.destroy_map`
    are updated accordingly.  All inputs are converted to GPU variables
    in `self.worker.ctx_name`.

    Raises TypeError when `src` and `dest` do not share the same Type.
    """
    if dest is not None:
        inputs = [src, dest]
        self.inplace = True
        self.inplace_pattern = {0: 1}
    else:
        inputs = [src]
        self.inplace_pattern = {0: 0} if self.inplace else {}

    self.destroy_map = {
        out_idx: [in_idx]
        for out_idx, in_idx in self.inplace_pattern.items()
    }

    inputs = [
        as_gpuarray_variable(var, self.worker.ctx_name) for var in inputs
    ]
    if dest is not None and not inputs[0].type == inputs[1].type:
        raise TypeError("`src` and `dest` must have the same Type:",
                        (inputs[0].type, inputs[1].type))

    result_type = inputs[0].type.clone()
    return theano.Apply(self, inputs, [result_type()])
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    """
    Replace a pooling op by a call to `dnn_pool`.

    Returns None (no replacement) when cuDNN is not available for
    `ctx_name`, when `op.ignore_border` is false, or when `op.ndim` is
    neither 2 nor 3.
    """
    if not dnn_available(ctx_name) or not op.ignore_border:
        return None

    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return None

    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    pool_mode = op.mode

    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim != nd + 2:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        padded_img = pad_dims(img, 2, nd)
        padded_result = dnn_pool(
            padded_img, ws, stride=stride, pad=pad, mode=pool_mode
        )
        return unpad_dims(padded_result, img, 2, nd)

    return dnn_pool(img, ws, stride=stride, pad=pad, mode=pool_mode)
def make_node(self, inp):
    """
    Build the Apply node for a contiguous float32 GPU matrix.

    The single output has the same type as the converted input.

    Raises
    ------
    RuntimeError
        If CUSOLVER or pygpu is not available.

    """
    if not cusolver_available:
        raise RuntimeError('CUSOLVER is not available and '
                           'GpuCholesky Op can not be constructed.')
    # NOTE: this is a plain string comparison, hence lexicographic.
    if skcuda.__version__ <= '0.5.1':
        # Name the op that is actually being built (GpuCholesky).
        warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 '
                      'to work with CUDA 8')
    if not pygpu_available:
        # The literals are concatenated: the first one needs a trailing
        # space so that the two sentences do not run together.
        raise RuntimeError('Missing pygpu or triu/tril functions. '
                           'Install or update libgpuarray.')

    context_name = basic_ops.infer_context_name(inp)
    inp = basic_ops.as_gpuarray_variable(inp, context_name)
    inp = basic_ops.gpu_contiguous(inp)

    # this op can only operate on float32 matrices
    # because of current implementation of triu/tril.
    # TODO: support float64 for triu/tril in GpuArray and for
    # GpuCholesky/GpuCusolverSolve in Theano.
    assert inp.ndim == 2
    assert inp.dtype == 'float32'

    return theano.Apply(self, [inp], [inp.type()])
def make_node(self, x):
    """
    Build the Apply node for a single GPU input.

    `x` is converted to a GPU variable in the context inferred from it;
    the single output has the same type as the converted input.
    """
    gpu_x = as_gpuarray_variable(x, infer_context_name(x))
    return Apply(self, [gpu_x], [gpu_x.type()])