def local_dot_to_gemm16(node):
    A = node.inputs[0]
    B = node.inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
        C = GpuAllocEmpty(dtype='float16')(
            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return Gemm16()(C, 1.0, A, B, 0.0)
def local_dot_to_gemm16(node, ctx_name):
    if nerv is None:
        return
    A = node.inputs[0]
    B = node.inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
        C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return Gemm16()(C, 1.0, A, B, 0.0)
def local_dot_to_gemm16(node):
    if nerv is None:
        return
    A = node.inputs[0]
    B = node.inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
        C = GpuAllocEmpty(dtype='float16')(
            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return Gemm16()(C, 1.0, A, B, 0.0)
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
    if nerv is None:
        return
    A = inputs[0]
    B = inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = getattr(outputs[0], 'fgraph', None)
        C = GpuAllocEmpty('float16', ctx_name)(
            shape_i(A, 0, fgraph),
            shape_i(B, 1, fgraph))
        return Gemm16()(C, 1.0, A, B, 0.0)
def local_gpua_hgemm(node):
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        _logger.warning("Not performing dot of float16 on the GPU since "
                        "cuda 7.5 is not available. Updating could speed up "
                        "your code.")
        return
    A = node.inputs[0]
    B = node.inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
        C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph),
                                           shape_i(B, 1, fgraph))
        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
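# Reference sketch (plain NumPy, not part of the source above) of the BLAS
# semantics these float16 rewrites rely on: Gemm16()(C, alpha, A, B, beta)
# and gpugemm_no_inplace(C, alpha, A, B, beta) compute
# alpha * dot(A, B) + beta * C.  With alpha=1.0 and beta=0.0 the destination
# buffer is fully overwritten, which is why an uninitialized GpuAllocEmpty
# buffer of shape (A.shape[0], B.shape[1]) is a valid output argument.
import numpy as np


def gemm_reference(C, alpha, A, B, beta):
    # beta == 0 means C is never read, matching BLAS conventions.
    out = alpha * np.dot(A, B)
    if beta != 0:
        out = out + beta * C
    return out


A = np.random.rand(4, 5).astype('float16')
B = np.random.rand(5, 3).astype('float16')
C = np.empty((4, 3), dtype='float16')  # plays the role of GpuAllocEmpty
assert np.allclose(gemm_reference(C, 1.0, A, B, 0.0), np.dot(A, B))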
def test_nonstandard_shapes():
    a = tensor3(config.floatX)
    a.tag.test_value = np.random.random((2, 3, 4)).astype(config.floatX)
    b = tensor3(theano.config.floatX)
    b.tag.test_value = np.random.random((2, 3, 4)).astype(config.floatX)

    tl = make_list([a, b])
    tl_shape = shape(tl)
    assert np.array_equal(tl_shape.get_test_value(), (2, 2, 3, 4))

    # There's no `FunctionGraph`, so it should return a `Subtensor`
    tl_shape_i = shape_i(tl, 0)
    assert isinstance(tl_shape_i.owner.op, Subtensor)
    assert tl_shape_i.get_test_value() == 2

    tl_fg = FunctionGraph([a, b], [tl], features=[ShapeFeature()])
    tl_shape_i = shape_i(tl, 0, fgraph=tl_fg)
    assert not isinstance(tl_shape_i.owner.op, Subtensor)
    assert tl_shape_i.get_test_value() == 2

    none_shape = shape(NoneConst)
    assert np.array_equal(none_shape.get_test_value(), [])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None):
    """
    GPU convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding
        size can be directly specified by an integer or a pair of integers
    :param subsample: perform subsampling of the output (default: (1, 1))
    :param conv_mode: perform convolution (kernels flipped) or
        cross-correlation. One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm
        choice. By default, GpuDnnConv will be used to carry out the
        convolution. If border_mode is 'valid', subsample is (1, 1) and
        direction_hint is 'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1, 1) and direction_hint
        is *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been
        warned.

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            kerns = kerns[:, :, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        shape = theano.tensor.stack(kerns.shape[1], img.shape[1],
                                    img.shape[2] - kerns.shape[2] + 1,
                                    img.shape[3] - kerns.shape[3] + 1)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(img.shape, shape)
        conv = GpuDnnConvGradW()(img, kerns, desc, shape[2], shape[3])
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))

    elif (border_mode == 'full' and subsample == (1, 1) and
            direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        shape = theano.tensor.stack(shape_i(img, 0, fgraph),
                                    shape_i(kerns, 1, fgraph),
                                    shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(shape, kerns.shape)
        return GpuDnnConvGradI()(kerns, img, desc, shape2, shape3)

    # Standard case: We use GpuDnnConv with suitable padding.
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns.shape)
    return GpuDnnConv()(img, kerns, desc)
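# Illustrative usage sketch only, not taken from the source above. It
# assumes a CUDA GPU with cuDNN is available, that Theano is configured to
# use it, and that dnn_conv is imported from the cuDNN module of the GPU
# backend in use (the old theano.sandbox.cuda.dnn path is assumed here).
import theano
import theano.tensor as T
from theano.sandbox.cuda.dnn import dnn_conv

img = T.ftensor4('img')      # (batch, channel, row, col), i.e. 'bc01'
kerns = T.ftensor4('kerns')  # (output channel, input channel, row, col)

# With border_mode='full', subsample=(1, 1) and no direction_hint, the
# branch above computes the result through GpuDnnConvGradI.
out = dnn_conv(img, kerns, border_mode='full', conv_mode='cross')
f = theano.function([img, kerns], out)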
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None,
             algo=None):
    """
    GPU convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    Parameters
    ----------
    img
        Images to do the convolution over.
    kerns
        Convolution filters.
    border_mode
        One of 'valid', 'full'; additionally, the padding size can be
        directly specified by an integer or a pair of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    conv_mode
        Perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross' (default: 'conv').
    direction_hint
        Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1, 1) and direction_hint is
        'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1, 1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been
        warned.
    algo : {'none', 'small', 'large', 'fft', 'guess_once',
            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Convolution implementation to use. Some of its values may require
        certain versions of cuDNN to be installed. Default is the value of
        :attr:`config.dnn.conv.algo_fwd`.

    .. warning:: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.

    """
    if workmem is not None:
        if algo is not None:
            raise ValueError("You can't use both algo and workmem")
        warnings.warn("workmem is deprecated, use algo instead",
                      stacklevel=2)
        algo = workmem

    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)

    if (border_mode == 'valid' and subsample == (1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            kerns = kerns[:, :, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
                                       shape_i(img, 1, fgraph),
                                       shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))

    elif (border_mode == 'full' and subsample == (1, 1) and
            direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cuDNN v2 rc3 needs contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        out = GpuAllocEmpty(img.dtype)(shape_i(img, 0, fgraph),
                                       shape_i(kerns, 1, fgraph),
                                       shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)

    # Standard case: We use GpuDnnConv with suitable padding.
    # gpu_contiguous will return a copy if the img contains negative strides.
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
    out = GpuAllocEmpty(img.dtype)(*out_shp)
    return GpuDnnConv(algo=algo)(img, kerns, out, desc)
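# Quick check (plain Python, not part of the source) of the shape formulas
# used in the two special-case branches above: a 'valid' convolution
# produces in - kern + 1 output rows/columns, which is why the
# 'bprop weights' branch subtracts the kernel extent, while a 'full'
# convolution produces in + kern - 1, which is why the GpuDnnConvGradI
# branch adds it.
def valid_out_dim(in_dim, kern_dim):
    return in_dim - kern_dim + 1


def full_out_dim(in_dim, kern_dim):
    return in_dim + kern_dim - 1


assert valid_out_dim(32, 5) == 28
assert full_out_dim(32, 5) == 36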
def local_gpua_careduce(node, context_name):
    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
                                      scalar.Maximum, scalar.Minimum)):
        ctx = get_context(context_name)
        if ctx.kind == b'opencl':
            op = GpuCAReduceCPY
            if node.op.scalar_op not in [scalar.add, scalar.mul]:
                # We don't yet support all reductions with the cpy code.
                return
        elif ctx.kind == b'cuda':
            op = GpuCAReduceCuda
        else:
            return False
        x, = node.inputs
        greduce = op(node.op.scalar_op, axis=node.op.axis,
                     dtype=getattr(node.op, 'dtype', None),
                     acc_dtype=getattr(node.op, 'acc_dtype', None))
        gvar = greduce(x)
        # We need make_node to be called, otherwise the mask can be None.
        if (op is GpuCAReduceCPY or
                gvar.owner.op.supports_c_code(
                    [as_gpuarray_variable(x, context_name)])):
            return greduce
        else:
            # Try to make a simpler pattern based on reshaping.
            # The principle is that if two adjacent dimensions have
            # the same value in the reduce_mask, then we can reshape
            # to make them a single dimension, do the reduction, and
            # then reshape to get them back.
            if node.op.axis is None:
                reduce_mask = [1] * x.type.ndim
            else:
                reduce_mask = [0] * x.type.ndim
                for a in node.op.axis:
                    assert reduce_mask[a] == 0
                    reduce_mask[a] = 1

            new_in_shp = [shape_i(x, 0)]
            new_mask = [reduce_mask[0]]
            for i in xrange(1, x.type.ndim):
                if reduce_mask[i] == reduce_mask[i - 1]:
                    new_in_shp[-1] *= shape_i(x, i)
                else:
                    new_mask.append(reduce_mask[i])
                    new_in_shp.append(shape_i(x, i))
            new_axis = []
            for idx, m in enumerate(new_mask):
                if m == 1:
                    new_axis.append(idx)
            greduce = op(node.op.scalar_op,
                         axis=new_axis, reduce_mask=new_mask,
                         dtype=getattr(node.op, 'dtype', None),
                         acc_dtype=getattr(node.op, 'acc_dtype', None))

            reshaped_x = x.reshape(tensor.stack(new_in_shp))
            gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
            gvar = greduce(gpu_reshaped_x)
            # We need make_node to be called, otherwise the mask can be None.
            reshaped_gpu_inputs = [gpu_reshaped_x]
            if greduce.supports_c_code(reshaped_gpu_inputs):
                reduce_reshaped_x = host_from_gpu(greduce(gpu_reshaped_x))

                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
                    out_shp = []
                    for i in range(x.ndim):
                        if i not in node.op.axis:
                            out_shp.append(shape_i(x, i))
                    unreshaped_reduce = reduce_reshaped_x.reshape(
                        tensor.stack(out_shp))
                else:
                    unreshaped_reduce = reduce_reshaped_x
                return [unreshaped_reduce]
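# Standalone NumPy sketch (not part of the source) of the dimension-merging
# trick used in the fallback branch above: adjacent axes that share the same
# reduce_mask value are collapsed into one, so a reduction over axes (1, 2)
# of a (2, 3, 4, 5) tensor becomes a reduction over axis 1 of a (2, 12, 5)
# view, followed by a reshape back to the expected output shape.
import numpy as np


def merged_reduce_sum(x, axis):
    mask = [1 if i in axis else 0 for i in range(x.ndim)]
    new_shape, new_mask = [x.shape[0]], [mask[0]]
    for i in range(1, x.ndim):
        if mask[i] == mask[i - 1]:
            new_shape[-1] *= x.shape[i]  # merge with the previous dimension
        else:
            new_shape.append(x.shape[i])
            new_mask.append(mask[i])
    new_axis = tuple(i for i, m in enumerate(new_mask) if m == 1)
    out = x.reshape(new_shape).sum(axis=new_axis)
    out_shape = [x.shape[i] for i in range(x.ndim) if i not in axis]
    return out.reshape(out_shape)


x = np.random.rand(2, 3, 4, 5)
assert np.allclose(merged_reduce_sum(x, (1, 2)), x.sum(axis=(1, 2)))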
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None,
             algo=None):
    """
    GPU convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    Parameters
    ----------
    img
        Images to do the convolution over.
    kerns
        Convolution filters.
    border_mode
        One of 'valid', 'full'; additionally, the padding size can be
        directly specified by an integer or a pair of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    conv_mode
        Perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross' (default: 'conv').
    direction_hint
        Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1, 1) and direction_hint is
        'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1, 1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been
        warned.
    algo : {'none', 'small', 'large', 'fft', 'guess_once',
            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Convolution implementation to use. Some of its values may require
        certain versions of cuDNN to be installed. Default is the value of
        :attr:`config.dnn.conv.algo_fwd`.

    .. warning:: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.

    """
    if workmem is not None:
        if algo is not None:
            raise ValueError("You can't use both algo and workmem")
        warnings.warn("workmem is deprecated, use algo instead",
                      stacklevel=2)
        algo = workmem

    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    ctx_name = infer_context_name(img, kerns)

    if (border_mode == 'valid' and subsample == (1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            kerns = kerns[:, :, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        out = GpuAllocEmpty(img.dtype, ctx_name)(
            shape_i(kerns, 1, fgraph), shape_i(img, 1, fgraph),
            shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)

    elif (border_mode == 'full' and subsample == (1, 1) and
            direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cuDNN v2 rc3 needs contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
                                                 shape_i(kerns, 1, fgraph),
                                                 shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)

    # Standard case: We use GpuDnnConv with suitable padding.
    # gpu_contiguous will return a copy if the img contains negative strides.
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
    out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp)
    return GpuDnnConv(algo=algo)(img, kerns, out, desc)