import stat
import sys

import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano.configparser import config, AddConfigVar, StrParam, BoolParam

import nvcc_compiler

# ignore_newtrees is to speed up the optimization, as this is the pattern
# we use for optimization.  Otherwise, we can iterate 100s of times on
# the graph and apply only a few optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_seqopt = SequenceDB()


def register_opt(*tags, **kwargs):
    if any([not isinstance(t, str) for t in tags]):
        raise RuntimeError(
            "Bad call to register_opt."
            " All tags must be strings.", tags)

    def f(local_opt):
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'fast_compile',
                               'gpu', *tags)
        return local_opt
    return f
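# Hedged usage sketch (not part of the original file): register_opt() above is
# normally stacked on top of theano.gof.local_optimizer.  The optimizer below
# is a hypothetical no-op, shown only to illustrate the calling pattern;
# `local_gpu_noop_example` does not exist in Theano.
from theano.gof import local_optimizer


@register_opt()
@local_optimizer([theano.tensor.Elemwise])
def local_gpu_noop_example(node):
    # Returning False tells the equilibrium optimizer that this rewrite
    # does not apply to the given node.
    return False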
    GpuSoftmaxWithBias,
    GpuSoftmax,
)
from .elemwise import GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, GpuCAReduceCPY
from .subtensor import (
    GpuIncSubtensor,
    GpuSubtensor,
    GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1,
    GpuAdvancedIncSubtensor1_dev20,
)
from .opt_util import alpha_merge, output_merge

_logger = logging.getLogger("theano.sandbox.gpuarray.opt")

gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1,
                    "fast_compile", "fast_run", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2,
                    "fast_compile", "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register("gpuarray_opt", gpu_seqopt,
               optdb.__position__.get("add_destroy_handler", 49.5) - 1,
               "gpuarray")
    HostFromGpu, GpuFromHost, GpuSplit, GpuContiguous,
    gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin)
from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
                       GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedIncSubtensor1,
                        GpuAdvancedIncSubtensor1_dev20)

gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()

gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')
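# Hedged usage sketch (not part of the original file): because 'fast_run' is
# deliberately kept off the optdb registration above, these optimizations only
# run when the 'gpuarray' tag is requested, e.g. when a GPU device is
# initialized or when the tag is asked for explicitly in a compile mode.
# The name `gpu_mode` is only an illustration.
import theano

gpu_mode = theano.compile.get_default_mode().including('gpuarray')
# The same can be requested with the Theano flag optimizer_including=gpuarray.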
    gpu_alloc, gpu_shape, GpuAlloc, GpuShape, GpuReshape, GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.nnet import (
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant

gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()

gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)

            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       numpy.intc(inputs[1][0].size), block=block, grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk


pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    if isinstance(node.op, GpuElemwise):
        if (not any([any(i.type.broadcastable) for i in node.inputs]) and
                all([i.ndim <= 2 for i in node.inputs])):
            new_op = PycudaElemwiseSourceModuleOp(
                node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
            return [new_op]
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
                       GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedSubtensor1,
                        GpuAdvancedIncSubtensor1,
                        GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge

_logger = logging.getLogger("theano.gpuarray.opt")

gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler,
                        InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
                                               gpu_alloc, GpuReshape)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm

gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()

gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx,
    GpuSoftmaxWithBias,
    GpuSoftmax,
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import (
    GpuIncSubtensor,
    GpuSubtensor,
    GpuAdvancedIncSubtensor1,
    GpuAdvancedIncSubtensor1_dev20,
)
from theano.sandbox.gpuarray.type import GpuArrayConstant

gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()

gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1,
                    "fast_run", "inplace", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2,
                    "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register("gpuarray_opt", gpu_seqopt,
               optdb.__position__.get("add_destroy_handler", 49.5) - 1,
               "gpuarray")


def register_opt(*tags, **kwargs):
    def f(local_opt):
        name = (kwargs and kwargs.pop("name")) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt
    return f
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]), c_code, "pycuda_elemwise_kernel_%s"%str(self.scalar_op), preamble="""#include<Python.h> #include <numpy/arrayobject.h>""") return out_node def perform(self, node, inputs, out): #TODO assert all input have the same shape z, = out if z[0] is None or z[0].shape!=inputs[0].shape: z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) i = inputs + z self.pycuda_fct(*i) pycuda_optimizer = EquilibriumDB() gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run") @local_optimizer([]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if not any([ any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim<=2 for i in node.inputs]): new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) @local_optimizer([])
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)

            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       numpy.intc(inputs[1][0].size), block=block, grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk


pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    if isinstance(node.op, GpuElemwise):
        if (not any([any(i.type.broadcastable) for i in node.inputs]) and
                all([i.ndim <= 2 for i in node.inputs])):
            new_op = PycudaElemwiseSourceModuleOp(
                node.op.scalar_op,
                node.op.inplace_pattern)(*node.inputs)
            return [new_op]
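# Hedged worked example (not part of the original file): the launch geometry
# above switches to a flat grid of 512-thread blocks once the input has more
# than 512 elements.  For a hypothetical 40 x 60 input (2400 elements):
import numpy

size = 40 * 60
grid = (int(numpy.ceil(size / 512.)), 1)   # ceil(2400 / 512) = 5 blocks
block = (512, 1, 1)                        # 512 threads per block
assert grid == (5, 1) and block == (512, 1, 1)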
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)

            out = pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                             numpy.intc(inputs[1][0].size), block=block,
                             grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk


pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    if isinstance(node.op, GpuElemwise):
        if (not any([any(i.type.broadcastable) for i in node.inputs]) and
                all([i.ndim <= 2 for i in node.inputs])):
            new_op = PycudaElemwiseSourceModuleOp(
                node.op.scalar_op,
                node.op.inplace_pattern)(*node.inputs)
            return [new_op]