def __init__(self, pool_shape, inplace, BCHW_grad_output):
    """
    Store the op configuration and, for the non-inplace variant, make sure
    the matching inplace substitution is registered in ``optdb``.

    :param pool_shape: iterable with exactly two positive entries; kept as a tuple
    :param inplace: if true, ``destroy_map = {0: [0]}`` is set on the op
      and no optimization is registered
    :param BCHW_grad_output: stored on the op; only accepted together
      with ``inplace``
    """
    pool_shape = tuple(pool_shape)
    super(PoolHWBCOpGrad, self).__init__()
    assert len(pool_shape) == 2, len(pool_shape)
    for extent in pool_shape:
        assert extent > 0, extent
    if BCHW_grad_output:
        assert inplace

    self.pool_shape = pool_shape
    self.inplace = inplace
    self.BCHW_grad_output = BCHW_grad_output

    if inplace:
        self.destroy_map = {0: [0]}
        return

    # Non-inplace variant: register the optimization for this pool_shape.
    # The list kept on optdb remembers which shapes were already handled,
    # so each shape is registered only once.
    marker = 'PoolHWBCOpGradInplaceOpt_registered'
    if not hasattr(optdb, marker):
        setattr(optdb, marker, [])
    handled_shapes = getattr(optdb, marker)
    if pool_shape in handled_shapes:
        return

    inplace_twin = PoolHWBCOpGrad(self.pool_shape, inplace=True,
                                  BCHW_grad_output=False)
    substitution = OpSub(self, inplace_twin)
    handled_shapes.append(pool_shape)
    topo_opt = theano.gof.TopoOptimizer(
        substitution, failure_callback=gof.TopoOptimizer.warn_inplace)
    optdb.register('PoolHWBCOpGradInplaceOpt' + str(pool_shape), topo_opt,
                   50.0, 'fast_run', 'inplace', 'gpuarray')
def f(local_opt):
    """
    Register ``local_opt`` in ``optdb`` and hand it back unchanged.

    The optimizer is wrapped in a ``TopoOptimizer`` whose failure callback is
    ``TopoOptimizer.warn_inplace`` and registered at position 60 with the
    tags 'fast_run', 'inplace', 'gpu' followed by ``tags``.

    ``kwargs`` and ``tags`` are not parameters; they are taken from the
    surrounding scope.

    :param local_opt: the local optimizer to register
    :return: ``local_opt`` itself, so the function can be used as a decorator
    """
    # Pop 'name' with a default. The former ``kwargs and kwargs.pop('name')``
    # raised KeyError whenever keyword arguments were given without a
    # 'name' entry; now that case falls back to the optimizer's own name.
    name = (kwargs.pop('name', None) if kwargs else None) or local_opt.__name__
    optdb.register(
        name,
        TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
        60, 'fast_run', 'inplace', 'gpu', *tags)
    return local_opt
def f(local_opt):
    """
    Register ``local_opt`` in ``optdb`` and hand it back unchanged.

    The optimizer is wrapped in a ``TopoOptimizer`` whose failure callback is
    ``TopoOptimizer.warn_inplace`` and registered at position 60 with the
    tags 'fast_run', 'inplace', 'gpuarray' followed by ``tags``.

    ``kwargs`` and ``tags`` are not parameters; they are taken from the
    surrounding scope.

    :param local_opt: the local optimizer to register
    :return: ``local_opt`` itself, so the function can be used as a decorator
    """
    # Pop 'name' with a default. The former ``kwargs and kwargs.pop('name')``
    # raised KeyError whenever keyword arguments were given without a
    # 'name' entry; now that case falls back to the optimizer's own name.
    name = (kwargs.pop('name', None) if kwargs else None) or local_opt.__name__
    optdb.register(
        name,
        TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
        60, 'fast_run', 'inplace', 'gpuarray', *tags)
    return local_opt
def f(local_opt):
    """
    Register ``local_opt`` in ``optdb`` and hand it back unchanged.

    The optimizer is wrapped in a ``TopoOptimizer`` whose failure callback is
    ``TopoOptimizer.warn_inplace`` and registered at position 60 with the
    tags "fast_run", "inplace", "gpuarray" followed by ``tags``.

    ``kwargs`` and ``tags`` are not parameters; they are taken from the
    surrounding scope.

    :param local_opt: the local optimizer to register
    :return: ``local_opt`` itself, so the function can be used as a decorator
    """
    # Pop "name" with a default. The former ``kwargs and kwargs.pop("name")``
    # raised KeyError whenever keyword arguments were given without a
    # "name" entry; now that case falls back to the optimizer's own name.
    name = (kwargs.pop("name", None) if kwargs else None) or local_opt.__name__
    optdb.register(
        name,
        TopoOptimizer(local_opt, failure_callback=TopoOptimizer.warn_inplace),
        60,
        "fast_run",
        "inplace",
        "gpuarray",
        *tags,
    )
    return local_opt
def register_func(recurrent_transform):
    """
    Return the non-inplace ``LSTMCustomOp`` for ``recurrent_transform``,
    creating and caching it on first use.

    The first call for a given transform also creates the matching
    ``LSTMCustomOpGrad`` (stored in ``grad_ops``) and registers, for the op
    and for its grad, an ``OpSub`` optimization built from the non-inplace
    and the inplace instance. Later calls return the cached op directly.

    :type recurrent_transform: RecurrentTransform.RecurrentTransformBase
    :return: the entry of ``function_ops`` for this transform
    """
    transform_name = recurrent_transform.name
    transform_id = id(recurrent_transform)
    key = (transform_name, transform_id)
    if key in function_ops:
        return function_ops[key]

    def _create_and_register(op_class, cache, marker_format):
        # Build both variants, remember the non-inplace one under ``key``.
        plain_op = op_class(fun_name=transform_name, inplace=False,
                            recurrent_transform=recurrent_transform)
        inplace_op = op_class(fun_name=transform_name, inplace=True,
                              recurrent_transform=recurrent_transform)
        cache[key] = plain_op

        # hack to avoid being called twice: a flag attribute on optdb
        # records that this registration name has already been used.
        marker = marker_format % (transform_name, transform_id)
        if hasattr(optdb, marker):
            return
        substitution = OpSub(plain_op, inplace_op)
        optdb.register(marker, theano.gof.TopoOptimizer(substitution),
                       50.0, 'fast_run', 'inplace', 'gpuarray')
        setattr(optdb, marker, True)

    # register op
    _create_and_register(LSTMCustomOp, function_ops,
                         'LSTMCustomMOpInplaceOpt_%s_%i')
    # the same for grad
    _create_and_register(LSTMCustomOpGrad, grad_ops,
                         'LSTMCustomMOpGradInplaceOpt_%s_%i')

    return function_ops[key]
def __init__(self, pool_shape, inplace, BCHW_grad_output):
    """
    Store the op configuration and, for the non-inplace variant, make sure
    the matching inplace substitution is registered in ``optdb``.

    :param pool_shape: iterable with exactly two positive entries; kept as a tuple
    :param inplace: if true, ``destroy_map = {0: [0]}`` is set on the op
      and no optimization is registered
    :param BCHW_grad_output: stored on the op; only accepted together
      with ``inplace``
    """
    pool_shape = tuple(pool_shape)
    super(PoolHWBCOpGrad, self).__init__()
    assert len(pool_shape) == 2, len(pool_shape)
    for extent in pool_shape:
        assert extent > 0, extent
    if BCHW_grad_output:
        assert inplace

    self.pool_shape = pool_shape
    self.inplace = inplace
    self.BCHW_grad_output = BCHW_grad_output

    if inplace:
        self.destroy_map = {0: [0]}
        return

    # Non-inplace variant: register the optimization for this pool_shape.
    # The list kept on optdb remembers which shapes were already handled,
    # so each shape is registered only once.
    marker = 'PoolHWBCOpGradInplaceOpt_registered'
    if not hasattr(optdb, marker):
        setattr(optdb, marker, [])
    handled_shapes = getattr(optdb, marker)
    if pool_shape in handled_shapes:
        return

    inplace_twin = PoolHWBCOpGrad(self.pool_shape, inplace=True,
                                  BCHW_grad_output=False)
    substitution = OpSub(self, inplace_twin)
    handled_shapes.append(pool_shape)
    topo_opt = theano.gof.TopoOptimizer(
        substitution, failure_callback=gof.TopoOptimizer.warn_inplace)
    optdb.register('PoolHWBCOpGradInplaceOpt' + str(pool_shape), topo_opt,
                   50.0, 'fast_run', 'inplace', 'gpuarray')
# we wont need this copy anymore output[0] = variable.copy() @gof.local_optimizer([OpFromGraph]) def inline_ofg_expansion(node): """ This optimization expands internal graph of OpFromGraph. Only performed if node.op.is_inline == True Doing so can improve optimization at the cost of compilation speed. """ op = node.op if not isinstance(op, OpFromGraph): return False if not op.is_inline: return False return theano.clone( op.local_outputs, {u: v for u, v in izip(node.op.local_inputs, node.inputs)}) # We want to run this before the first merge optimizer # and before the first scan optimizer. optdb.register('inline_ofg_expansion', gof.opt.in2out(inline_ofg_expansion), -0.01, 'fast_compile', 'fast_run') # Since OpFromGraph contains a Theano compiled function, # we should let DebugMode know about it ops_with_inner_function[OpFromGraph] = 'fn'
inplace=False) CuDNNConvHWBCOpGradValidInplaceInstance = CuDNNConvHWBCOpGrad("valid", inplace=True) CuDNNConvHWBCOpGradFullNoInplaceInstance = CuDNNConvHWBCOpGrad("full", inplace=False) CuDNNConvHWBCOpGradFullInplaceInstance = CuDNNConvHWBCOpGrad("full", inplace=True) CuDNNConvHWBCOpGradValidInplaceOpt = OpSub( CuDNNConvHWBCOpGradValidNoInplaceInstance, CuDNNConvHWBCOpGradValidInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'CuDNNConvHWBCOpGradValidInplaceOpt_registered'): optdb.register( 'CuDNNConvHWBCOpGradValidInplaceOpt', theano.gof.TopoOptimizer( CuDNNConvHWBCOpGradValidInplaceOpt, failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.CuDNNConvHWBCOpGradValidInplaceOpt_registered = True #TODO: maybe this optimization causes problems #CuDNNConvHWBCOpGradFullInplaceOpt = OpSub(CuDNNConvHWBCOpGradFullNoInplaceInstance, CuDNNConvHWBCOpGradFullInplaceInstance) ##hack to avoid being called twice #if not hasattr(optdb, 'CuDNNConvHWBCOpGradFullInplaceOpt_registered'): # optdb.register('CuDNNConvHWBCOpGradFullInplaceOpt', # theano.gof.TopoOptimizer(CuDNNConvHWBCOpGradFullInplaceOpt, failure_callback=gof.TopoOptimizer.warn_inplace), # 50.0, 'fast_run', 'inplace', 'gpuarray') # optdb.CuDNNConvHWBCOpGradFullInplaceOpt_registered = True #------------------------------------------------------
def infer_shape(self, node, input_shapes): return input_shapes[0], #def c_code_cache_version(self): # return 1, 0 CropToBatchImageSizeInstance = CropToBatchImageSizeOp(-1e20, False) CropToBatchImageSizeInplaceInstance = CropToBatchImageSizeOp(-1e20, True) CropToBatchImageSizeZeroInstance = CropToBatchImageSizeOp(0.0, False) CropToBatchImageSizeZeroInplaceInstance = CropToBatchImageSizeOp(0.0, True) CropToBatchImageSizeGradInplaceOpt1 = OpSub(CropToBatchImageSizeInstance, CropToBatchImageSizeInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'CropToBatchImageSizeGradInplaceOpt1_registered'): optdb.register('CropToBatchImageSizeGradInplaceOpt1', theano.gof.TopoOptimizer(CropToBatchImageSizeGradInplaceOpt1, failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.CropToBatchImageSizeGradInplaceOpt1_registered = True CropToBatchImageSizeGradInplaceOpt2 = OpSub(CropToBatchImageSizeZeroInstance, CropToBatchImageSizeZeroInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'CropToBatchImageSizeGradInplaceOpt2_registered'): optdb.register('CropToBatchImageSizeGradInplaceOpt2', theano.gof.TopoOptimizer(CropToBatchImageSizeGradInplaceOpt2, failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.CropToBatchImageSizeGradInplaceOpt2_registered = True
grad_op = grad_op.__class__(**kwargs) else: old_grad_op_input0 = grad_op_v.owner.inputs[0] sum_inputs = [old_grad_op_input0] + sum_inputs assert len(sum_inputs) > 0 if len(sum_inputs) == 1: new_grad_op_input0 = sum_inputs[0] else: new_grad_op_input0 = T.add(*sum_inputs) new_grad_op_inputs = [new_grad_op_input0] + grad_op_v.owner.inputs[1:] new_v = grad_op(*new_grad_op_inputs) return [new_v] optdb.register('add_merge_MultiBatchBeamGradAddOp', gof.TopoOptimizer(add_merge_MultiBatchBeamGradAddOp), 0.1, 'fast_run') @gof.local_optimizer([MultiBatchBeamGradAddOp], inplace=True) def inplace_MultiBatchBeamGradAddOp(node): if isinstance(node.op, MultiBatchBeamGradAddOp ) and not node.op.inplace and not node.op.zero_with_shape: kwargs = {k: getattr(node.op, k) for k in node.op.__props__} kwargs["inplace"] = True new_op = node.op.__class__(**kwargs) new_v = new_op(*node.inputs) return [new_v] return False
conv_groupopt.register(
    'local_conv2d_gradinputs_cpu',
    local_conv2d_gradinputs_cpu,
    40,
    'fast_compile', 'fast_run')


# Verify that no AbstractConv are present in the graph
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_check(node):
    """
    Fail loudly if ``node`` still holds one of the abstract convolution ops.

    :param node: the apply node being visited
    :raises AssertionError: if ``node.op`` is an instance of AbstractConv2d,
      AbstractConv2d_gradWeights or AbstractConv2d_gradInputs; the message
      names the op kind and lists likely causes
    :return: None for any other node
    """
    # Common tail of the three error messages.
    advice = ('Did you exclude both "conv_dnn" and "conv_gemm" from '
              'the optimizer? Is cudnn available and does the GPU support it?')
    checked_op = node.op
    if isinstance(checked_op, AbstractConv2d):
        raise AssertionError(
            'AbstractConv2d theano optimization failed. ' + advice)
    if isinstance(checked_op, AbstractConv2d_gradWeights):
        raise AssertionError(
            'AbstractConv2d_gradWeights theano optimization failed. ' + advice)
    if isinstance(checked_op, AbstractConv2d_gradInputs):
        raise AssertionError(
            'AbstractConv2d_gradInputs theano optimization failed. ' + advice)


# NOTE: the registration key is spelled 'AbstracConvCheck' while the optimizer
# itself is named "AbstractConvCheck"; both spellings are kept as they are.
optdb.register(
    'AbstracConvCheck',
    opt.in2out(local_abstractconv_check, name="AbstractConvCheck"),
    48.7,
    'fast_compile', 'fast_run')
@gof.local_optimizer([RandomFunction]) def random_make_inplace(node): op = node.op if isinstance(op, RandomFunction) and not op.inplace: # Read op_fn from op.state, not from op.fn, since op.fn # may not be picklable. op_fn, op_outtype, op_inplace, op_ndim_added = op._props() new_op = RandomFunction(op_fn, op_outtype, inplace=True, ndim_added=op_ndim_added) return new_op.make_node(*node.inputs).outputs return False optdb.register('random_make_inplace', opt.in2out(random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace') class RandomStreamsBase(object): def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64', prob=None): """ Sample n times with probability of success p for each trial and return the number of successes. If the size argument is ambiguous on the number of dimensions, ndim may be a plain integer to supplement the missing information. """
gpu_dot22 = GpuDot22()

from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out


@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
    """
    Replace a ``gpugemv_no_inplace`` node by ``gpugemv_inplace``.

    :return: a one-element list with the new output, or None if the
      node's op is not ``gpugemv_no_inplace``
    """
    if not (node.op == gpugemv_no_inplace):
        return None
    return [gpugemv_inplace(*node.inputs)]


@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """
    Replace a ``gpugemm_no_inplace`` node by ``gpugemm_inplace``.

    :return: a one-element list with the new output, or None if the
      node's op is not ``gpugemm_no_inplace``
    """
    if not (node.op == gpugemm_no_inplace):
        return None
    return [gpugemm_inplace(*node.inputs)]


@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
    """
    Replace a ``gpuger_no_inplace`` node by ``gpuger_inplace``.

    :return: a one-element list with the new output, or None if the
      node's op is not ``gpuger_no_inplace``
    """
    if not (node.op == gpuger_no_inplace):
        return None
    return [gpuger_inplace(*node.inputs)]


# The three local optimizers above are grouped and wrapped with in2out,
# then registered as a single entry in optdb.
gpuablas_opt_inplace = in2out(
    LocalOptGroup(local_inplace_gpuagemv,
                  local_inplace_gpuagemm,
                  local_inplace_gpuager),
    name='gpuablas_opt_inplace')
optdb.register(
    'InplaceGpuaBlasOpt',
    gpuablas_opt_inplace,
    70.0,
    'fast_run', 'inplace', 'gpuarray')
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
    """Apply a new ``Gemm16`` with the node's ``relu`` setting to ``inputs``."""
    merged_op = Gemm16(relu=node.op.relu)
    return [merged_op(*inputs)]


@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
    """Apply a new ``Gemm16`` with the node's ``relu`` setting to ``inputs``."""
    merged_op = Gemm16(relu=node.op.relu)
    return [merged_op(*inputs)]


@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """
    Replace a non-inplace ``Gemm16`` node by its inplace version.

    Only nodes whose op is exactly of type ``Gemm16`` and not already
    inplace are rewritten. If the first input is produced by a
    ``GpuAllocEmpty`` and has more than one client, that input is replaced
    by a fresh application of the producing op to the same inputs.

    :return: a one-element list with the new output, or None when the node
      is left alone
    """
    op = node.op
    if type(op) is not Gemm16 or op.inplace:
        return
    new_inputs = list(node.inputs)
    target = new_inputs[0]
    shared_alloc = (target.owner and
                    isinstance(target.owner.op, GpuAllocEmpty) and
                    len(target.clients) > 1)
    if shared_alloc:
        producer = target.owner
        new_inputs[0] = producer.op(*producer.inputs)
    inplace_op = Gemm16(relu=op.relu, inplace=True)
    return [inplace_op(*new_inputs)]


optdb.register(
    'local_gemm16_inplace',
    tensor.opt.in2out(local_gemm16_inplace, name='local_gemm16_inplace'),
    70.0,
    'fast_run', 'inplace', 'gpuarray')
@inplace_allocempty(GpuDnnConvGradW, 2) def local_dnn_convgw_inplace(node, inputs): return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)] @inplace_allocempty(GpuDnnConvGradI, 2) def local_dnn_convgi_inplace(node, inputs): return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)] optdb.register( "local_dnna_conv_inplace", tensor.opt.in2out( local_dnn_conv_inplace, local_dnn_convgw_inplace, local_dnn_convgi_inplace, name="local_dnna_conv_inplace" ), 70.0, "fast_run", "inplace", "gpuarray", "cudnn", ) @register_opt("cudnn") @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5) def local_dnn_conv_alpha_merge(node, *inputs): return [GpuDnnConv(algo=node.op.algo)(*inputs)] @register_opt("cudnn") @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
} """ % locals() #!!! change this when changing the code! def c_code_cache_version(self): return 1, 5 LSTMOpGradNoInplaceInstance = LSTMOpGrad(inplace=False) LSTMOpGradInplaceInstance = LSTMOpGrad(inplace=True) LSTMOpGradInplaceOpt = OpSub(LSTMOpGradNoInplaceInstance, LSTMOpGradInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'LSTMOpGradInplaceOpt_registered'): optdb.register('LSTMOpGradInplaceOpt', theano.gof.TopoOptimizer(LSTMOpGradInplaceOpt), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.LSTMOpGradInplaceOpt_registered = True #------------------------ class LSTMOp(theano.sandbox.cuda.GpuOp): def __init__(self, inplace): self.inplace = inplace if inplace: #all outputs operate inplace on input 0 (which is Z) #but when the input is marked multiple times, we get an error #so we only mark that output 0 destroys input 0 #anyway theano knows that input 0 will be destroyed, so it should be OK #TODO self.destroy_map = {0: [0]}
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmax1HotWithBiasDx, GpuSoftmax, GpuSoftmaxWithBias, ) from theano.compile import optdb from theano.tensor.blas import _is_real_vector, _is_real_matrix # optdb.print_summary() # shows what is currently registered gpu_optimizer = EquilibriumDB() gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() gpu_seqopt.register("gpu_local_optimizations", gpu_optimizer, 1, "fast_run", "inplace") gpu_seqopt.register("gpu_cut_transfers", gpu_cut_copies, 2, "fast_run", "gpu") optdb.register("gpu_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpu") # This second pass is needed as the fusion can put all the non float32 code # inside the elemwise. When it there is no float64 op, this is working. optdb.register("gpu_after_fusion", ProxyDB(gpu_seqopt), optdb.__position__.get("elemwise_fusion", 71) + 0.1, "gpu") def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop("name")) or local_opt.__name__ gpu_optimizer.register(name, local_opt, "fast_run", "inplace", *tags) return local_opt return f # register local_track_shape_i at this level too
]), msg else: msg = "size must be a tuple of int or a Theano variable" assert isinstance(size, Variable) and size.ndim == 1, msg generator = theano.shared(False) # makes a generic s_size = theano.tensor.as_tensor_variable(size) u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size, self.next_seed()) self.state_updates.append(u.update) rval = u * std + avg if u.type.broadcastable != rval.type.broadcastable: raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low`' 'and `high` arguments') return rval @local_optimizer([None]) def local_destructive(node): op = node.op if isinstance(op, CURAND_Base) and not op.destructive: # op might be gpu version new_op = op.as_destructive() return new_op.make_node(*node.inputs).outputs return False optdb.register('CURAND_destructive', opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run', 'inplace')
any_inplace = False for info in kwargs["in_info"]: if info.get("want_inplace", -1) >= 0: any_inplace = True info["is_inplace"] = True if not any_inplace: return False new_op = node.op.__class__(**kwargs) from TheanoUtil import make_var_tuple new_v = make_var_tuple(new_op(*node.inputs)) return new_v return False optdb.register('inplace_NativeOp', gof.TopoOptimizer(inplace_NativeOp , failure_callback=gof.TopoOptimizer.warn_inplace ), 60, 'fast_run', 'inplace') @try_register_gpu_opt(NativeOp) def local_gpu_NativeOp(node): if isinstance(node.op, NativeOp): # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py from theano.sandbox.cuda import host_from_gpu, gpu_from_host, as_cuda_ndarray_variable args = node.inputs if any([(x.owner and x.owner.op == host_from_gpu) for x in args]): gpu_op = GpuNativeOp(**{key: getattr(node.op, key) for key in node.op.__props__}) args = [x.owner.inputs[0] if (x.owner and x.owner.op == host_from_gpu) else x for x in args] from TheanoUtil import make_var_tuple
def add_requirements(self, fgraph): fgraph.attach_feature(toolbox.ReplaceValidate()) def apply(self, fgraph): for node in fgraph.toposort(): #print node if type(node.op) == GpuDimShuffle and node.op.new_order == (2, 3, 0, 1): X = node.inputs[0] if hasattr(X.owner, "op") and type(X.owner.op) == PoolHWBCOpGrad and X.owner.op.inplace: fgraph.replace_validate(node.outputs[0], node.inputs[0]) replace_op = PoolHWBCOpGrad(X.owner.op.pool_shape, inplace=True, BCHW_grad_output=True) fgraph.replace_validate(X.owner.outputs[0], replace_op(*X.owner.inputs)) RemoveConvGradDimshuffleOptimizer = RemoveConvGradDimshuffle() if not hasattr(optdb, 'RemoveConvGradDimshuffleOptimizer_registered'): optdb.register('RemoveConvGradDimshuffle', RemoveConvGradDimshuffleOptimizer, 50.5, 'fast_run', 'inplace', 'gpuarray') optdb.RemoveConvGradDimshuffleOptimizer_registered = True #--------------------------- #for the moment we implement only ignore_border = True and no padding class PoolHWBCOp(theano.sandbox.cuda.GpuOp): __props__ = ("pool_shape",) def __init__(self, pool_shape): pool_shape = tuple(pool_shape) super(PoolHWBCOp, self).__init__() assert len(pool_shape) == 2, len(pool_shape) assert pool_shape[0] > 0, pool_shape[0] assert pool_shape[1] > 0, pool_shape[1]
def local_dnn_conv_inplace(node, inputs):
    """Apply an inplace ``GpuDnnConv`` with the node's ``algo`` to ``inputs``."""
    inplace_op = GpuDnnConv(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]


@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Apply an inplace ``GpuDnnConvGradW`` with the node's ``algo`` to ``inputs``."""
    inplace_op = GpuDnnConvGradW(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]


@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Apply an inplace ``GpuDnnConvGradI`` with the node's ``algo`` to ``inputs``."""
    inplace_op = GpuDnnConvGradI(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]


# The three inplace optimizers are registered together as one optdb entry.
optdb.register(
    'local_dnna_conv_inplace',
    tensor.opt.in2out(local_dnn_conv_inplace,
                      local_dnn_convgw_inplace,
                      local_dnn_convgi_inplace,
                      name="local_dnna_conv_inplace"),
    70.0,
    'fast_run', 'inplace', 'gpuarray', 'cudnn')


@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Apply a new ``GpuDnnConv`` with the node's ``algo`` to ``inputs``."""
    merged_op = GpuDnnConv(algo=node.op.algo)
    return [merged_op(*inputs)]


@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Apply a new ``GpuDnnConvGradW`` with the node's ``algo`` to ``inputs``."""
    merged_op = GpuDnnConvGradW(algo=node.op.algo)
    return [merged_op(*inputs)]
dout = T.as_tensor_variable(dout) return [dout] @gof.local_optimizer([Contiguous], inplace=True) def opt_remove_contiguous(node): if isinstance(node.op, Contiguous): x, = node.inputs if x.owner and isinstance( x.owner.op, (T.Alloc, T.AllocEmpty, T.extra_ops.CpuContiguous)): return [x] return False optdb.register('opt_remove_contiguous', gof.TopoOptimizer(opt_remove_contiguous), 10, 'fast_run') # Theano will not do this optimization. So we register it now. # See: https://github.com/Theano/Theano/issues/4400 @try_register_gpu_opt(Contiguous) def local_gpu_Contiguous(node): if isinstance(node.op, Contiguous): # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py from theano.sandbox.cuda import host_from_gpu x, = node.inputs if x.owner and x.owner.op == host_from_gpu: from theano.sandbox.cuda.basic_ops import gpu_contiguous return [host_from_gpu(gpu_contiguous(x.owner.inputs[0]))]
if len(nw_inner) != len(op_ins): op_outs = scan_utils.clone(op_outs, replace=givens) nw_info = op.info.copy() nw_info['n_seqs'] = nw_n_seqs # DEBUG CHECK nwScan = scan_op.Scan(nw_inner, op_outs, nw_info) nw_outs = nwScan.make_node(*nw_outer).outputs return nw_outs else: return False scan_seqopt = theano.gof.SequenceDB() # We run before blas opt at 1.7 and specialize 2.0 # but after stabilize at 1.5. Should we put it before stabilize? optdb.register('scan_seqopt', scan_seqopt, 1.6, 'fast_run', 'scan') scan_seqopt.register('scanOp_remove_constants_and_unused_inputs', opt.in2out(remove_constants_and_unused_inputs_scan, ignore_newtrees=True), 5, 'fast_run', 'scan') # This is a global opt for historical reason # It should be possible to change it to a local opt. class PushOutNonSeqScan(gof.Optimizer): def __init__(self): gof.Optimizer.__init__(self)
assert all([isinstance(i, int) or isinstance(i, Variable) for i in size]), msg else: msg = "size must be a tuple of int or a Theano variable" assert isinstance(size, Variable) and size.ndim == 1, msg generator = theano.shared(False) # makes a generic s_size = theano.tensor.as_tensor_variable(size) u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size, self.next_seed()) self.state_updates.append(u.update) rval = u * std + avg if u.type.broadcastable != rval.type.broadcastable: raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low`' 'and `high` arguments' ) return rval @local_optimizer([CURAND_Base]) def local_destructive(node): op = node.op if isinstance(op, CURAND_Base) and not op.destructive: # op might be gpu version new_op = op.as_destructive() return new_op.make_node(*node.inputs).outputs return False optdb.register('CURAND_destructive', opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run', 'inplace')
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20) gpu_optimizer = EquilibriumDB() gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, 'fast_compile', 'fast_run', 'inplace', 'gpuarray') gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_compile', 'fast_run', 'gpuarray') # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register('gpuarray_opt', gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1, 'gpuarray') def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) return local_opt return f register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) gpu_optimizer.register('local_remove_all_assert', theano.tensor.opt.local_remove_all_assert, 'unsafe')
a = LargeSparseTargets(what_to_output=2).make_node(*fnode.inputs) f, g = a.outputs z = fnode.outputs[0] fgraph.replace_validate(z, f, "replace by a cost+grad op") for gnode in gnodes: z = gnode.outputs[0] fgraph.replace_validate(z, g, "replace by a cost+grad op") mergelst = MergeLargeSparseTargetOps() #optdb['specialize'].register('merge_large_sparse_target_ops', mergelst, 'fast_run') optdb.register("global_large_sparse_targets_merge", mergelst, 48.5, "fast_run") # add CPU TO GPU merge #@register_specialize #@local_optimizer([LargeSparseTargets]) def local_large_sparse_targets_gpu(node): if not isinstance(node.op, LargeSparseTargets) or theano.config.device == "cpu": return False if node.op.what_to_output == 0: return [GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)] elif node.op.what_to_output == 1: return [ host_from_gpu(
'fast_compile', 'fast_run') # Legacy convolution conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40, 'fast_compile', 'fast_run') conv_groupopt.register('local_conv2d_gradweight_cpu', local_conv2d_gradweight_cpu, 40, 'fast_compile', 'fast_run') conv_groupopt.register('local_conv2d_gradinputs_cpu', local_conv2d_gradinputs_cpu, 40, 'fast_compile', 'fast_run') # Verify that no AbstractConv are present in the graph @local_optimizer( [AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]) def local_abstractconv_check(node): if isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)): raise AssertionError( '%s Theano optimization failed: there is no implementation ' 'available supporting the requested options. Did you exclude ' 'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, ' 'is cuDNN available and does the GPU support it? If on CPU, ' 'do you have a BLAS library installed Theano can link against?' % node.op.__class__.__name__) optdb.register('AbstracConvCheck', opt.in2out(local_abstractconv_check, name="AbstractConvCheck"), 48.7, 'fast_compile', 'fast_run')
""" if isinstance(size, tuple): msg = "size must be a tuple of int or a Theano variable" assert all([isinstance(i, int) or isinstance(i, Variable) for i in size]), msg else: msg = "size must be a tuple of int or a Theano variable" assert isinstance(size, Variable) and size.ndim == 1, msg generator = theano.shared(False) # makes a generic s_size = theano.tensor.as_tensor_variable(size) u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size, self.next_seed()) self.state_updates.append(u.update) rval = u * std + avg if u.type.broadcastable != rval.type.broadcastable: raise NotImplementedError( "Increase the size to match the broadcasting pattern of `low`" "and `high` arguments" ) return rval @local_optimizer([CURAND_Base]) def local_destructive(node): op = node.op if isinstance(op, CURAND_Base) and not op.destructive: # op might be gpu version new_op = op.as_destructive() return new_op.make_node(*node.inputs).outputs return False optdb.register("CURAND_destructive", opt.in2out(local_destructive, ignore_newtrees=True), 99, "fast_run", "inplace")
gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() # Don't register this right now conv_groupopt = LocalGroupDB() conv_groupopt.__name__ = "gpua_conv_opts" gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, 'fast_compile', 'fast_run', 'gpuarray') gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_compile', 'fast_run', 'gpuarray') # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register('gpuarray_opt', gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1, 'gpuarray') def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) return local_opt return f def register_inplace(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ optdb.register(
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc from elemwise import GpuElemwise, _is_scalar gpu_optimizer = EquilibriumDB() gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, 'fast_run', 'inplace', 'gpuarray') gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_run', 'gpuarray') # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register('gpuarray_opt', gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1, 'gpuarray') def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) return local_opt return f register_opt()(theano.tensor.opt.local_track_shape_i) class InputToGpuOptimizer(Optimizer): "Transfer the input to the gpu to start the rolling wave." def add_requirements(self, fgraph):
op = node.op if (isinstance(op, IfElse) and not op.as_view and # For big graph, do not make inplace scalar to speed up # optimization. (len(node.fgraph.apply_nodes) < 500 or not all([getattr(o.type, "ndim", -1) == 0 for o in node.outputs]))): return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(*node.inputs, **dict(return_list=True)) return False optdb.register( "cond_make_inplace", opt.in2out(cond_make_inplace, ignore_newtrees=True), 95, "fast_run", "inplace", ) # XXX: Optimizations commented pending further debugging (certain optimizations # make computation less lazy than it should be currently). # # ifelse_equilibrium = gof.EquilibriumDB() # ifelse_seqopt = gof.SequenceDB() # ifelse_equilibrium.register('seq_ifelse', ifelse_seqopt, 'fast_run', # 'ifelse') """ Comments: I've wrote this comments to explain how the optimization of ifelse function (for future developers that need to parse this part of code. Please try to keep this comments in sync with whatever changes you add to the code.
BidirectionalTwoDLSTMOpGradNoInplaceInstance = BidirectionalTwoDLSTMOpGrad( inplace=False) BidirectionalTwoDLSTMOpGradInplaceInstance = BidirectionalTwoDLSTMOpGrad( inplace=True) BidirectionalTwoDLSTMOpInplaceOpt = OpSub( BidirectionalTwoDLSTMOpGradNoInplaceInstance, BidirectionalTwoDLSTMOpGradInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'BidirectionalTwoDLSTMOpInplaceOpt_registered'): optdb.register( 'BidirectionalTwoDLSTMOpInplaceOpt', theano.gof.TopoOptimizer( BidirectionalTwoDLSTMOpInplaceOpt, failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.BidirectionalTwoDLSTMOpInplaceOpt_registered = True class BidirectionalTwoDLSTMOp(theano.sandbox.cuda.GpuOp): __props__ = () def __init__(self): super(BidirectionalTwoDLSTMOp, self).__init__() def make_node(self, X, W1, W2, V_h1, V_h2, V_v1, V_v2, b1, b2, sizes): var_names = [ "X", "W1", "W2", "V_h1", "V_h2", "V_v1", "V_v2", "b1", "b2" ]
if len(nw_inner) != len(op_ins): op_outs = scan_utils.clone(op_outs, replace=givens) nw_info = copy.deepcopy(op.info) nw_info['n_seqs'] = nw_n_seqs # DEBUG CHECK nwScan = scan_op.Scan(nw_inner, op_outs, nw_info) nw_outs = nwScan.make_node(*nw_outer).outputs return nw_outs else: return False scan_seqopt = theano.gof.SequenceDB() # We run before blas opt at 1.7 and specialize 2.0 # but after stabilize at 1.5. Should we put it before stabilize? optdb.register('scan_seqopt', scan_seqopt, 1.6, 'fast_run', 'scan') scan_seqopt.register('scanOp_remove_constants_and_unused_inputs', opt.in2out(remove_constants_and_unused_inputs_scan, ignore_newtrees=True), 5, 'fast_run', 'scan') # This is a global opt for historical reason # It should be possible to change it to a local opt. class PushOutNonSeqScan(gof.Optimizer): def __init__(self): gof.Optimizer.__init__(self)
if len(nw_inner) != len(op_ins): op_outs = scan_utils.clone(op_outs, replace=givens) nw_info = op.info.copy() nw_info["n_seqs"] = nw_n_seqs # DEBUG CHECK nwScan = scan_op.Scan(nw_inner, op_outs, nw_info) nw_outs = nwScan.make_node(*nw_outer).outputs return nw_outs else: return False scan_seqopt = theano.gof.SequenceDB() # We run before blas opt at 1.7 and specialize 2.0 # but after stabilize at 1.5. Should we put it before stabilize? optdb.register("scan_seqopt", scan_seqopt, 1.6, "fast_run", "scan") scan_seqopt.register( "scanOp_remove_constants_and_unused_inputs", opt.in2out(remove_constants_and_unused_inputs_scan, ignore_newtrees=True), 5, "fast_run", "scan", ) # This is a global opt for historical reason # It should be possible to change it to a local opt. class PushOutNonSeqScan(gof.Optimizer): def __init__(self): gof.Optimizer.__init__(self)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm

# Database filled by register_opt() below.
gpu_optimizer = EquilibriumDB()
# Database registered under the name 'gpuarray_cut_transfers'.
gpu_cut_copies = EquilibriumDB()

# Sequence grouping the two databases above; the whole sequence is then
# registered in the global optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Build a decorator that registers a local optimizer in ``gpu_optimizer``.

    :param tags: extra tags appended after the fixed 'fast_run' and
        'gpuarray' tags.
    :param kwargs: only ``name`` is read; it overrides the registration
        name, which otherwise is ``local_opt.__name__``.
    :return: a decorator that registers its argument and returns it
        unchanged.
    """
    def f(local_opt):
        # pop() with a default: keyword arguments given without 'name' used
        # to raise KeyError here. 'name' is still consumed on first use, as
        # before.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f


register_opt()(theano.tensor.opt.local_track_shape_i)
if len(nw_inner) != len(op_ins): op_outs = scan_utils.clone(op_outs, replace=givens) nw_info = copy.deepcopy(op.info) nw_info["n_seqs"] = nw_n_seqs # DEBUG CHECK nwScan = scan_op.Scan(nw_inner, op_outs, nw_info) nw_outs = nwScan.make_node(*nw_outer).outputs return nw_outs else: return False scan_seqopt = theano.gof.SequenceDB() # We run before blas opt at 1.7 and specialize 2.0 # but after stabilize at 1.5. Should we put it before stabilize? optdb.register("scan_seqopt", scan_seqopt, 1.6, "fast_run", "scan") scan_seqopt.register( "scanOp_remove_constants_and_unused_inputs", opt.in2out(remove_constants_and_unused_inputs_scan, ignore_newtrees=True), 5, "fast_run", "scan", ) # This is a global opt for historical reason # It should be possible to change it to a local opt. class PushOutNonSeqScan(gof.Optimizer): def __init__(self): gof.Optimizer.__init__(self)
0, 1): X = node.inputs[0] if hasattr(X.owner, "op") and type( X.owner.op) == PoolHWBCOpGrad and X.owner.op.inplace: fgraph.replace_validate(node.outputs[0], node.inputs[0]) replace_op = PoolHWBCOpGrad(X.owner.op.pool_shape, inplace=True, BCHW_grad_output=True) fgraph.replace_validate(X.owner.outputs[0], replace_op(*X.owner.inputs)) RemoveConvGradDimshuffleOptimizer = RemoveConvGradDimshuffle() if not hasattr(optdb, 'RemoveConvGradDimshuffleOptimizer_registered'): optdb.register('RemoveConvGradDimshuffle', RemoveConvGradDimshuffleOptimizer, 50.5, 'fast_run', 'inplace', 'gpuarray') optdb.RemoveConvGradDimshuffleOptimizer_registered = True #--------------------------- #for the moment we implement only ignore_border = True and no padding class PoolHWBCOp(theano.sandbox.cuda.GpuOp): __props__ = ("pool_shape", ) def __init__(self, pool_shape): pool_shape = tuple(pool_shape) super(PoolHWBCOp, self).__init__() assert len(pool_shape) == 2, len(pool_shape) assert pool_shape[0] > 0, pool_shape[0]
GpuAdvancedIncSubtensor1_dev20) from theano.sandbox.gpuarray.type import GpuArrayConstant gpu_optimizer = EquilibriumDB() gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, 'fast_compile', 'fast_run', 'inplace', 'gpuarray') gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_compile', 'fast_run', 'gpuarray') # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register('gpuarray_opt', gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1, 'gpuarray') def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) return local_opt return f register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) def safe_to_gpu(x): if isinstance(x.type, tensor.TensorType):
def inline_ofg_expansion(node):
    """
    Replace an inlinable OpFromGraph node by a clone of its inner graph.

    The rewrite is attempted only when ``node.op`` is an OpFromGraph whose
    ``is_inline`` flag is set; every other node is rejected with False.
    Doing so can improve optimization at the cost of compilation speed.
    """
    ofg = node.op
    if not isinstance(ofg, OpFromGraph):
        return False
    if not ofg.is_inline:
        return False
    # Pair every inner input with the outer input that feeds it.
    substitutions = dict(zip(ofg.local_inputs, node.inputs))
    return theano.clone(ofg.local_outputs, substitutions)


# Registered at a negative position: this has to run before the first merge
# optimizer and before the first scan optimizer.
optdb.register(
    "inline_ofg_expansion",
    gof.opt.in2out(inline_ofg_expansion),
    -0.01,
    "fast_compile",
    "fast_run",
)

# OpFromGraph wraps a Theano compiled function (attribute "fn"), so
# DebugMode must be told about it.
ops_with_inner_function[OpFromGraph] = "fn"
]) def local_abstractconv_check(node): if isinstance( node.op, ( AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs, AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs, ), ): raise LocalMetaOptimizerSkipAssertionError( "%s Theano optimization failed: there is no implementation " "available supporting the requested options. Did you exclude " 'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, ' "is cuDNN available and does the GPU support it? If on CPU, " "do you have a BLAS library installed Theano can link against? " "On the CPU we do not support float16." % node.op.__class__.__name__) optdb.register( "AbstractConvCheck", in2out(local_abstractconv_check, name="AbstractConvCheck"), 48.7, "fast_compile", "fast_run", )
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Rebuild the node with an in-place GpuDnnConvGradI.

    The new op keeps the ``algo`` and ``num_groups`` of the original op and
    is applied to ``inputs``.
    """
    inplace_op = GpuDnnConvGradI(
        algo=node.op.algo,
        inplace=True,
        num_groups=node.op.num_groups,
    )
    return [inplace_op(*inputs)]


# One in2out pass bundling the three cuDNN convolution in-place rewrites.
optdb.register(
    "local_dnna_conv_inplace",
    theano.tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)


@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Return a fresh GpuDnnConv (same ``algo``/``num_groups``) on ``inputs``."""
    merged_op = GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)
    return [merged_op(*inputs)]
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
    """Return a fresh Gemm16 (same ``relu`` setting) applied to ``inputs``."""
    merged_op = Gemm16(relu=node.op.relu)
    return [merged_op(*inputs)]


@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
    """Return a fresh Gemm16 (same ``relu`` setting) applied to ``inputs``."""
    merged_op = Gemm16(relu=node.op.relu)
    return [merged_op(*inputs)]


@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """Replace a non-in-place Gemm16 node by its in-place variant.

    Returns None when the op is not exactly Gemm16 or is already in-place.
    """
    if type(node.op) != Gemm16 or node.op.inplace:
        return None

    args = list(node.inputs)
    out_buf = args[0]
    # Input 0 produced by GpuAllocEmpty and used by several clients: build a
    # new allocation from the same inputs (presumably so the in-place op gets
    # a buffer of its own).
    has_shared_empty = (out_buf.owner and
                        isinstance(out_buf.owner.op, GpuAllocEmpty) and
                        len(out_buf.clients) > 1)
    if has_shared_empty:
        args[0] = out_buf.owner.op(*out_buf.owner.inputs)

    inplace_op = Gemm16(relu=node.op.relu, inplace=True)
    return [inplace_op(*args)]


optdb.register(
    'local_gemm16_inplace',
    tensor.opt.in2out(local_gemm16_inplace, name='local_gemm16_inplace'),
    70.0,
    'fast_run',
    'inplace',
    'gpuarray',
)
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
    """Replace a non-in-place GpuDnnConvGradI node by its in-place variant.

    Returns None when the op is not exactly GpuDnnConvGradI or is already
    in-place.
    """
    if type(node.op) != GpuDnnConvGradI or node.op.inplace:
        return None

    args = list(node.inputs)
    out_buf = args[2]
    # Input 2 produced by GpuAllocEmpty and used by several clients: build a
    # new empty allocation of the same dtype (presumably so the in-place op
    # gets a buffer of its own).
    has_shared_empty = (out_buf.owner and
                        isinstance(out_buf.owner.op, GpuAllocEmpty) and
                        len(out_buf.clients) > 1)
    if has_shared_empty:
        fresh_alloc = GpuAllocEmpty(out_buf.owner.op.dtype)
        args[2] = fresh_alloc(*out_buf.owner.inputs)

    inplace_op = GpuDnnConvGradI(algo=node.op.algo, inplace=True)
    return [inplace_op(*args)]


# One in2out pass bundling the three cuDNN convolution in-place rewrites.
optdb.register(
    'local_dnna_conv_inplace',
    tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnn_conv_inplace",
    ),
    70.0,
    'fast_run',
    'inplace',
    'gpuarray',
    'cudnn',
)


@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Return a fresh GpuDnnConv (same ``algo``) applied to ``inputs``."""
    merged_op = GpuDnnConv(algo=node.op.algo)
    return [merged_op(*inputs)]


@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Return a fresh GpuDnnConvGradW (same ``algo``) applied to ``inputs``."""
    merged_op = GpuDnnConvGradW(algo=node.op.algo)
    return [merged_op(*inputs)]
""" % locals() #!!! change this when changing the code! #def c_code_cache_version(self): # return 2, 7 MultiDirectionalTwoDLSTMOpGradNoInplaceInstance = MultiDirectionalTwoDLSTMOpGrad(inplace=False) MultiDirectionalTwoDLSTMOpGradInplaceInstance = MultiDirectionalTwoDLSTMOpGrad(inplace=True) MultiDirectionalTwoDLSTMOpInplaceOpt = OpSub(MultiDirectionalTwoDLSTMOpGradNoInplaceInstance, MultiDirectionalTwoDLSTMOpGradInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'MultiDirectionalTwoDLSTMOpInplaceOpt_registered'): optdb.register('MultiDirectionalTwoDLSTMOpInplaceOpt', theano.gof.TopoOptimizer(MultiDirectionalTwoDLSTMOpInplaceOpt, failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.MultiDirectionalTwoDLSTMOpInplaceOpt_registered = True class MultiDirectionalTwoDLSTMOp(theano.sandbox.cuda.GpuOp): __props__ = () def __init__(self): super(MultiDirectionalTwoDLSTMOp, self).__init__() def make_node(self, X, W1, W2, W3, W4, V_h1, V_h2, V_h3, V_h4, V_v1, V_v2, V_v3, V_v4, b1, b2, b3, b4, sizes): var_names = ["X", "W1", "W2", "W3", "W4", "V_h1", "V_h2", "V_h3", "V_h4", "V_v1", "V_v2", "V_v3", "V_v4", "b1", "b2", "b3", "b4"] lcl = locals() for var_name in var_names:
else: return tuple(rval) @gof.local_optimizer([None]) def cond_make_inplace(node): op = node.op if isinstance(op, IfElse) and not op.as_view: return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(*node.inputs, **dict(return_list=True)) return False optdb.register('cond_make_inplace', opt.in2out(cond_make_inplace, ignore_newtrees=True), 95, 'fast_run', 'inplace') # XXX: Optimizations commented pending further debugging (certain optimizations # make computation less lazy than it should be currently). # # ifelse_equilibrium = gof.EquilibriumDB() # ifelse_seqopt = gof.SequenceDB() # ifelse_equilibrium.register('seq_ifelse', ifelse_seqopt, 'fast_run', # 'ifelse') ''' Comments: I've wrote this comments to explain how the optimization of ifelse function (for future developers that need to parse this part of code. Please try to keep this comments in sync with whatever changes you add to the code. ifelse optimization are registered before canonicalize !
return list(rval) else: return tuple(rval) @gof.local_optimizer([IfElse]) def cond_make_inplace(node): op = node.op if isinstance(op, IfElse) and not op.as_view: return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu, name=op.name)(*node.inputs, **dict(return_list=True)) return False optdb.register('cond_make_inplace', opt.in2out(cond_make_inplace, ignore_newtrees=True), 95, 'fast_run', 'inplace') # XXX: Optimizations commented pending further debugging (certain optimizations # make computation less lazy than it should be currently). # # ifelse_equilibrium = gof.EquilibriumDB() # ifelse_seqopt = gof.SequenceDB() # ifelse_equilibrium.register('seq_ifelse', ifelse_seqopt, 'fast_run', # 'ifelse') ''' Comments: I've wrote this comments to explain how the optimization of ifelse function (for future developers that need to parse this part of code. Please try to keep this comments in sync with whatever changes you add to the code. ifelse optimization are registered before canonicalize !
a = LargeSparseTargets(what_to_output=2).make_node(*fnode.inputs) f, g = a.outputs z = fnode.outputs[0] fgraph.replace_validate(z, f, "replace by a cost+grad op") for gnode in gnodes: z = gnode.outputs[0] fgraph.replace_validate(z, g, "replace by a cost+grad op") mergelst = MergeLargeSparseTargetOps() #optdb['specialize'].register('merge_large_sparse_target_ops', mergelst, 'fast_run') optdb.register("global_large_sparse_targets_merge", mergelst, 48.5, "fast_run") # add CPU TO GPU merge #@register_specialize #@local_optimizer([LargeSparseTargets]) def local_large_sparse_targets_gpu(node): if not isinstance(node.op, LargeSparseTargets) or theano.config.device == "cpu": return False if node.op.what_to_output == 0: return [GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)] elif node.op.what_to_output == 1: return [host_from_gpu(GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs))] else: out = GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs) return [out[0], host_from_gpu(out[1])]
gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() # Don't register this right now conv_groupopt = LocalGroupDB() conv_groupopt.__name__ = "gpua_conv_opts" gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, 'fast_compile', 'fast_run', 'gpuarray') gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_compile', 'fast_run', 'gpuarray') # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register('gpuarray_opt', gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1, 'gpuarray') def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) return local_opt return f def register_inplace(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__
if isinstance(op, RandomFunction) and not op.inplace: # Read op_fn from op.state, not from op.fn, since op.fn # may not be picklable. op_fn, op_outtype, op_inplace, op_ndim_added = op._props() new_op = RandomFunction(op_fn, op_outtype, inplace=True, ndim_added=op_ndim_added) return new_op.make_node(*node.inputs).outputs return False optdb.register( "random_make_inplace", opt.in2out(random_make_inplace, ignore_newtrees=True), 99, "fast_run", "inplace", ) class RandomStreamsBase(object): def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype="int64", prob=None): """ Sample n times with probability of success p for each trial and
#!!! change this when changing the code! def c_code_cache_version(self): return 1, 7 BLSTMOpGradNoInplaceInstance = BLSTMOpGrad(inplace=False) BLSTMOpGradInplaceInstance = BLSTMOpGrad(inplace=True) BLSTMOpGradInplaceOpt = OpSub(BLSTMOpGradNoInplaceInstance, BLSTMOpGradInplaceInstance) #hack to avoid being called twice if not hasattr(optdb, 'BLSTMOpGradInplaceOpt_registered'): optdb.register('BLSTMOpGradInplaceOpt', theano.gof.TopoOptimizer(BLSTMOpGradInplaceOpt), 50.0, 'fast_run', 'inplace', 'gpuarray') optdb.BLSTMOpGradInplaceOpt_registered = True #------------------------ class BLSTMOp(theano.sandbox.cuda.GpuOp): def __init__(self, inplace): self.inplace = inplace if inplace: #all outputs operate inplace on input 0 (which is Z) #but when the input is marked multiple times, we get an error #so we only mark that output 0 destroys input 0 #anyway theano knows that input 0 will be destroyed, so it should be OK #TODO
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.type import GpuArrayConstant gpu_optimizer = EquilibriumDB() gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, 'fast_run', 'inplace', 'gpuarray') gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_run', 'gpuarray') # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register('gpuarray_opt', gpu_seqopt, optdb.__position__.get('add_destroy_handler', 49.5) - 1, 'gpuarray') def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop('name')) or local_opt.__name__ gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags) return local_opt return f register_opt()(theano.tensor.opt.local_track_shape_i) def op_lifter(OP): """
output[0] = variable.copy() @gof.local_optimizer([OpFromGraph]) def inline_ofg_expansion(node): """ This optimization expands internal graph of OpFromGraph. Only performed if node.op.is_inline == True Doing so can improve optimization at the cost of compilation speed. """ op = node.op if not isinstance(op, OpFromGraph): return False if not op.is_inline: return False return theano.clone( op.local_outputs, { u: v for u, v in izip( node.op.local_inputs, node.inputs)}) # We want to run this before the first merge optimizer # and before the first scan optimizer. optdb.register( 'inline_ofg_expansion', gof.opt.in2out(inline_ofg_expansion), -0.01, 'fast_compile', 'fast_run') # Since OpFromGraph contains a Theano compiled function, # we should let DebugMode know about it ops_with_inner_function[OpFromGraph] = 'fn'
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out


@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
    """Swap ``gpugemv_no_inplace`` for ``gpugemv_inplace`` on the same inputs."""
    if node.op == gpugemv_no_inplace:
        replacement = gpugemv_inplace(*node.inputs)
        return [replacement]
    return None


@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """Swap ``gpugemm_no_inplace`` for ``gpugemm_inplace`` on the same inputs."""
    if node.op == gpugemm_no_inplace:
        replacement = gpugemm_inplace(*node.inputs)
        return [replacement]
    return None


@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
    """Swap ``gpuger_no_inplace`` for ``gpuger_inplace`` on the same inputs."""
    if node.op == gpuger_no_inplace:
        replacement = gpuger_inplace(*node.inputs)
        return [replacement]
    return None


# The three BLAS in-place rewrites grouped into a single in2out pass.
gpuablas_opt_inplace = in2out(
    LocalOptGroup(
        local_inplace_gpuagemv,
        local_inplace_gpuagemm,
        local_inplace_gpuager,
    ),
    name='gpuablas_opt_inplace',
)

optdb.register(
    'InplaceGpuaBlasOpt',
    gpuablas_opt_inplace,
    70.0,
    'fast_run',
    'inplace',
    'gpuarray',
)
GpuSoftmax, ) from theano.sandbox.gpuarray.elemwise import GpuElemwise, _is_scalar, GpuDimShuffle, GpuCAReduceCuda from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor from theano.sandbox.gpuarray.type import GpuArrayConstant gpu_optimizer = EquilibriumDB() gpu_cut_copies = EquilibriumDB() gpu_seqopt = SequenceDB() gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1, "fast_run", "inplace", "gpuarray") gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_run", "gpuarray") # do not add 'fast_run' to these two as this would always enable gpuarray mode optdb.register("gpuarray_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpuarray") def register_opt(*tags, **kwargs): def f(local_opt): name = (kwargs and kwargs.pop("name")) or local_opt.__name__ gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags) return local_opt return f register_opt()(theano.tensor.opt.local_track_shape_i) def op_lifter(OP):
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")

# Database filled by register_opt() below.
gpu_optimizer = EquilibriumDB()
# Database registered under the name "gpuarray_cut_transfers".
gpu_cut_copies = EquilibriumDB()

# Sequence grouping the two databases above; the whole sequence is then
# registered in the global optdb as "gpuarray_opt".
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1,
                    "fast_compile", "fast_run", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2,
                    "fast_compile", "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register("gpuarray_opt", gpu_seqopt,
               optdb.__position__.get("add_destroy_handler", 49.5) - 1,
               "gpuarray")


def register_opt(*tags, **kwargs):
    """Build a decorator that registers a local optimizer in ``gpu_optimizer``.

    :param tags: extra tags appended after the fixed "fast_run" and
        "gpuarray" tags.
    :param kwargs: only ``name`` is read; it overrides the registration
        name, which otherwise is ``local_opt.__name__``.
    :return: a decorator that registers its argument and returns it
        unchanged.
    """
    def f(local_opt):
        # pop() with a default: keyword arguments given without "name" used
        # to raise KeyError here. "name" is still consumed on first use, as
        # before.
        name = kwargs.pop("name", None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt
    return f


register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i)

gpu_optimizer.register("local_remove_all_assert",
                       theano.tensor.opt.local_remove_all_assert,
                       "unsafe")
return final_samples from theano.sandbox.gpuarray.opt import register_opt as register_gpua, host_from_gpu as host_from_gpua @register_gpua() @local_optimizer([mrg_uniform]) def local_gpua_mrg(node): if type(node.op) == mrg_uniform and isinstance(node.inputs[0].type, GpuArrayType): outs = GPUA_mrg_uniform.new(node.inputs[0], node.op.output_type.ndim, node.op.output_type.dtype, node.inputs[1]) return [outs[0], host_from_gpua(outs[1])] MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) @local_optimizer(MRG_RNGs) def mrg_random_make_inplace(node): op = node.op if isinstance(op, MRG_RNGs) and not op.inplace: # op might be gpu version new_op = op.__class__(op.output_type, inplace=True) return new_op.make_node(*node.inputs).outputs return False optdb.register( "random_make_inplace_mrg", opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, "fast_run", "inplace" )