# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register("gpuarray_opt", gpu_seqopt,
               optdb.__position__.get("add_destroy_handler", 49.5) - 1,
               "gpuarray")


def register_opt(*tags, **kwargs):
    """Return a decorator that registers a local optimizer in `gpu_optimizer`.

    The optimizer is registered under ``kwargs['name']`` when given,
    otherwise under the decorated function's own ``__name__``, always with
    the 'fast_run' and 'gpuarray' tags plus any extra ``tags``.
    """
    def f(local_opt):
        # pop(..., None) instead of the old `kwargs and kwargs.pop("name")`,
        # which raised KeyError whenever other keyword arguments were passed
        # without an explicit `name`.  Any remaining kwargs are ignored, as
        # in the original code.
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt
    return f


register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i)
gpu_optimizer.register("local_remove_all_assert",
                       theano.tensor.opt.local_remove_all_assert,
                       "unsafe")


def safe_to_gpu(x, ctx_name):
    """Transfer `x` to the GPU context `ctx_name` if it is a TensorType.

    Non-tensor variables are returned unchanged.
    """
    if isinstance(x.type, tensor.TensorType):
        return GpuFromHost(ctx_name)(x)
    else:
        return x


def safe_to_cpu(x):
    """Transfer `x` back to the host if it is a GpuArrayType variable.

    Other variables are returned unchanged.
    """
    if isinstance(x.type, GpuArrayType):
        return host_from_gpu(x)
    else:
        return x
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Return a decorator that registers a local optimizer in `gpu_optimizer`.

    The optimizer is registered under ``kwargs['name']`` when given,
    otherwise under the decorated function's own ``__name__``, always with
    the 'fast_run' and 'gpuarray' tags plus any extra ``tags``.
    """
    def f(local_opt):
        # pop(..., None) instead of the old `kwargs and kwargs.pop('name')`,
        # which raised KeyError whenever other keyword arguments were passed
        # without an explicit `name`.  Any remaining kwargs are ignored, as
        # in the original code.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f


register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
gpu_optimizer.register('local_remove_all_assert',
                       theano.tensor.opt.local_remove_all_assert,
                       'unsafe')


def safe_to_gpu(x):
    """Transfer `x` to the GPU if it is a TensorType variable.

    Non-tensor variables are returned unchanged.
    """
    if isinstance(x.type, tensor.TensorType):
        return gpu_from_host(x)
    else:
        return x


def safe_to_cpu(x):
    """Transfer `x` back to the host if it is a GpuArrayType variable.

    Other variables are returned unchanged.
    """
    if isinstance(x.type, GpuArrayType):
        return host_from_gpu(x)
    else:
        return x
# NOTE(review): this chunk is collapsed onto a single physical line and is not
# valid Python as written; the original multi-line layout must be restored
# before editing.  It begins with an orphaned `except TypeError, e:` whose
# matching `try:` is outside this view (Python-2-only syntax -- should become
# `except TypeError as e:` for Python 3), and it ends with dangling
# `@register_opt() @local_optimizer([GpuAlloc])` decorators whose function
# definition is also outside this view.  The visible middle registers
# transfer-cutting optimizers (`local_cut_gpu_host_gpu` removes
# gpu_from_host/host_from_gpu round-trips) -- TODO confirm against the
# original theano gpuarray opt source before restoring.
except TypeError, e: # This could fail if the inputs are not TensorTypes pass gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), 0, 'fast_run', 'fast_compile', 'merge') @local_optimizer([gpu_from_host, host_from_gpu]) def local_cut_gpu_host_gpu(node): if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu): return [node.inputs[0].owner.inputs[0]] if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host): return [node.inputs[0].owner.inputs[0]] return False gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu, 'fast_run', 'inplace', 'gpuarray') gpu_cut_copies.register('cut_gpua_constant_transfers', tensor.opt.constant_folding, 'fast_run', 'gpuarray') optdb['canonicalize'].register('local_cut_gpua_host_gpua', local_cut_gpu_host_gpu, 'fast_run', 'gpuarray') @register_opt() @op_lifter([tensor.Alloc]) def local_gpualloc(node): return gpu_alloc @register_opt() @local_optimizer([GpuAlloc])
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It defines and registers
# `local_pycuda_gpu_elemwise` (GpuElemwise -> PycudaElemwiseSourceModuleOp for
# non-broadcasted inputs of ndim <= 2), then opens a module-level `"""` string
# that is never closed within this view -- in the original file the text after
# it (the `local_pycuda_gpu_elemwise_kernel` variant with `""` pseudo-
# docstrings) is commented-out code inside that string, presumably closed by a
# later `"""` outside this view; TODO confirm before restoring.
@local_optimizer([GpuElemwise]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if (not any([any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim <= 2 for i in node.inputs])): new_op = PycudaElemwiseSourceModuleOp( node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) """ @local_optimizer([GpuElemwise]) def local_pycuda_gpu_elemwise_kernel(node): "" GpuElemwise -> PycudaElemwiseKernelOp "" if isinstance(node.op, GpuElemwise): if not any([any(i.type.broadcastable) for i in node.inputs]): new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)( *node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5)
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It starts with a nested
# `def f(local_opt):` that reads `kwargs` and `tags` from an enclosing
# decorator factory whose `def` line is outside this view (presumably a
# register_inplace-style helper registering TopoOptimizer-wrapped inplace
# optimizations at position 60 -- TODO confirm).  Note also that
# `(kwargs and kwargs.pop('name'))` raises KeyError when other keyword
# arguments (e.g. the `final_opt=True` used below) are passed without `name`.
def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ optdb.register( name, TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace), 60, 'fast_run', 'inplace', 'gpuarray', *tags) return local_opt return f register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) register_opt(final_opt=True, name='gpua_constant_folding')(tensor.opt.constant_folding) gpu_optimizer.register('local_remove_all_assert', theano.tensor.opt.local_remove_all_assert, 'unsafe') def safe_to_gpu(x, ctx_name): if isinstance(x.type, tensor.TensorType): return GpuFromHost(ctx_name)(x) else: return x def safe_to_cpu(x): if isinstance(x.type, GpuArrayType): return host_from_gpu(x) else: return x
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It registers the input-to-GPU
# lifter, defines `local_cut_gpu_host_gpu` (removes gpu_from_host/host_from_gpu
# round-trips via check_chain) and registers it in gpu_cut_copies and
# optdb['canonicalize'], then registers `local_gpualloc` via op_lifter.  The
# chunk ends with `def local_gpureshape(node):` whose body is outside this
# view -- do not treat that definition as complete.  Also note
# `@local_optimizer([])` here tracks no ops, unlike the
# `@local_optimizer([gpu_from_host, host_from_gpu])` form seen in sibling
# variants of this chunk -- TODO confirm which is intended.
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), 0, 'fast_run', 'fast_compile', 'merge') @local_optimizer([]) def local_cut_gpu_host_gpu(node): if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu): return [node.inputs[0].owner.inputs[0]] if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host): return [node.inputs[0].owner.inputs[0]] return False gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu, 'fast_run', 'inplace', 'gpuarray') gpu_cut_copies.register('cut_gpua_constant_transfers', tensor.opt.constant_folding, 'fast_run', 'gpuarray') optdb['canonicalize'].register('local_cut_gpua_host_gpua', local_cut_gpu_host_gpu, 'fast_run', 'gpuarray') @register_opt() @op_lifter(tensor.Alloc) def local_gpualloc(node): return gpu_alloc @register_opt() @op_lifter(tensor.Reshape) def local_gpureshape(node):
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It begins with an orphaned
# `pass` belonging to a suite outside this view, and it ends mid-expression
# inside `all(` in `local_gpuaalloc2` (the generator and the function body
# continue past this view) -- neither edge definition is complete here.  The
# complete middle defines `local_cut_gpu_host_gpu` and registers the
# transfer-cutting and constant-folding optimizers, matching the
# single-quoted variant of this chunk elsewhere in this file.
pass gpu_seqopt.register("InputToGpuArrayOptimizer", InputToGpuOptimizer(), 0, "fast_run", "fast_compile", "merge") @local_optimizer([gpu_from_host, host_from_gpu]) def local_cut_gpu_host_gpu(node): if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu): return [node.inputs[0].owner.inputs[0]] if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host): return [node.inputs[0].owner.inputs[0]] return False gpu_cut_copies.register("cut_gpua_host_transfers", local_cut_gpu_host_gpu, "fast_run", "inplace", "gpuarray") gpu_cut_copies.register("cut_gpua_constant_transfers", tensor.opt.constant_folding, "fast_run", "gpuarray") optdb["canonicalize"].register("local_cut_gpua_host_gpua", local_cut_gpu_host_gpu, "fast_run", "gpuarray") @register_opt() @local_optimizer([tensor.Alloc]) def local_gpuaalloc2(node): """ Join(axis, Alloc, Alloc, ...) -> Join(axis, GpuAlloc, Alloc, ...) Moves an alloc that is an input to join to the gpu. """ if isinstance(node.op, tensor.Alloc) and all( c != "output" and c.op == tensor.join
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It begins mid-method: the
# `z[0] = ...` / `self.pycuda_fct(*i)` statements read `self`, `inputs` and
# `z` from an enclosing thunk/perform method whose `def` is outside this view
# (presumably a pycuda Op's execution function -- TODO confirm).  The rest is
# complete: it creates the `pycuda_optimizer` EquilibriumDB, registers it in
# gpu_seqopt at position 1.5, and defines/registers the two GpuElemwise ->
# Pycuda elemwise-op substitution optimizers (the SourceModule variant also
# requires every input to have ndim <= 2).
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) i = inputs + z self.pycuda_fct(*i) pycuda_optimizer = EquilibriumDB() gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run") @local_optimizer([]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if not any([ any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim<=2 for i in node.inputs]): new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) @local_optimizer([]) def local_pycuda_gpu_elemwise_kernel(node): """ GpuElemwise -> PycudaElemwiseKernelOp """ if isinstance(node.op, GpuElemwise): if not any([ any(i.type.broadcastable) for i in node.inputs]): new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5)
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It defines and registers
# `local_pycuda_gpu_elemwise`, then opens a module-level `"""` string that is
# never closed within this view (the kernel variant after it, with `""`
# pseudo-docstrings, is presumably commented-out code inside that string --
# TODO confirm), and it ends with a truncated
# `pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",` call whose
# remaining arguments are outside this view.
@local_optimizer([GpuElemwise]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if (not any([any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim <= 2 for i in node.inputs])): new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)( *node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) """ @local_optimizer([GpuElemwise]) def local_pycuda_gpu_elemwise_kernel(node): "" GpuElemwise -> PycudaElemwiseKernelOp "" if isinstance(node.op, GpuElemwise): if not any([any(i.type.broadcastable) for i in node.inputs]): new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)( *node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
# NOTE(review): this chunk is collapsed onto a single physical line; restore
# the original multi-line layout before editing.  It defines and registers the
# two GpuElemwise -> Pycuda elemwise-op substitution optimizers (both skip
# nodes with any broadcastable input dimension; the SourceModule variant also
# requires every input to have ndim <= 2), but the final
# `pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",` call is
# truncated -- its remaining arguments (the optimizer and its priority) are
# outside this view, so this chunk is incomplete at its end.
@local_optimizer([]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if (not any([any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim <= 2 for i in node.inputs])): new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)( *node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) @local_optimizer([]) def local_pycuda_gpu_elemwise_kernel(node): """ GpuElemwise -> PycudaElemwiseKernelOp """ if isinstance(node.op, GpuElemwise): if not any([any(i.type.broadcastable) for i in node.inputs]): new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)( *node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",