def make_node(self, fmap, bbox, dy):
    fmap = as_cuda_ndarray_variable(fmap)
    bbox = as_cuda_ndarray_variable(bbox)
    dy = as_cuda_ndarray_variable(dy)
    assert bbox.ndim == 4 and dy.ndim == 4
    return theano.Apply(self, [fmap, bbox, dy], [fmap.type()])
def make_node(self, images, top_down):
    """
    .. todo::

        WRITEME
    """
    images = as_cuda_ndarray_variable(images)
    top_down = as_cuda_ndarray_variable(top_down)
    assert images.ndim == 4
    assert top_down.ndim == 4

    channels_broadcastable = images.type.broadcastable[0]
    batch_broadcastable = images.type.broadcastable[3]
    rows_broadcastable = False
    cols_broadcastable = False

    houtput_broadcastable = (channels_broadcastable, rows_broadcastable,
                             cols_broadcastable, batch_broadcastable)
    houtput_type = CudaNdarrayType(broadcastable=houtput_broadcastable)
    houtput = houtput_type()

    poutput_broadcastable = (channels_broadcastable, rows_broadcastable,
                             cols_broadcastable, batch_broadcastable)
    poutput_type = CudaNdarrayType(broadcastable=poutput_broadcastable)
    poutput = poutput_type()

    return Apply(self, [images, top_down], [houtput, poutput])
def make_node(self, inp1, inp2):
    inp1 = as_cuda_ndarray_variable(inp1)
    inp2 = as_cuda_ndarray_variable(inp2)
    assert inp1.ndim == 2
    assert inp2.ndim == 2
    return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_node(self, fmap, bbox):
    fmap = as_cuda_ndarray_variable(fmap)
    bbox = as_cuda_ndarray_variable(bbox)
    assert fmap.ndim == 4
    assert bbox.ndim == 4
    return theano.Apply(self, [fmap, bbox], [fmap.type()])
def make_node(self, output_spike, H_out, weights):
    if output_spike.type.ndim != 4:
        raise TypeError('output_spike must be 4D tensor')
    if H_out.type.ndim != 4:
        raise TypeError('H_out must be 4D tensor')
    if weights.type.ndim != 4:
        raise TypeError('weights must be 4D tensor')
    # if LR.type.ndim != 1:
    #     raise TypeError('LR must be 1D tensor')
    # if weight_update.type.ndim != 4:
    #     raise TypeError('weight_update must be 4D tensor')

    output_spike = as_cuda_ndarray_variable(output_spike)
    H_out = as_cuda_ndarray_variable(H_out)
    weights = as_cuda_ndarray_variable(weights)
    # LR = as_cuda_ndarray_variable(LR)
    # weight_update = as_cuda_ndarray_variable(weight_update)

    # Note: these shapes are symbolic expressions at graph-construction time,
    # not concrete values; this is only a construction-time trace.
    print 'MAKENODE: ', output_spike.shape, H_out.shape, weights.shape

    # broadcastable = [output_spike.type.broadcastable[0],
    #                  H_out.type.broadcastable[0],
    #                  weights.type.broadcastable[0],
    #                  weight_update, False, False, False, False]
    # otype = CudaNdarrayType(broadcastable=[False] * 4)
    broadcastable = [False, False, False, False, False]
    return Apply(self, [output_spike, H_out, weights],
                 [CudaNdarrayType(broadcastable)()])
def make_node(self, initial_state, inp_state, inp_update, inp_reset,
              state_to_state, state_to_update, state_to_reset):
    weights = [state_to_state, state_to_update, state_to_reset]
    batch_size = inp_state.shape[1]

    assert initial_state.dtype == "float32"
    assert initial_state.ndim == 1
    initial_state = as_cuda_ndarray_variable(
        tensor.repeat(initial_state[None, :], batch_size, 0))

    for i, w in enumerate(weights):
        weights[i] = as_cuda_ndarray_variable(w)

    inputs = [inp_state, inp_update, inp_reset]
    for i, b in enumerate(inputs):
        inputs[i] = as_cuda_ndarray_variable(b)

    for w in weights:
        assert w.dtype == "float32"
        assert w.ndim == 2
    for i in inputs:
        assert i.dtype == "float32"
        assert i.ndim == 3

    out_type = CudaNdarrayType((False, False))
    return theano.Apply(self, [initial_state] + inputs + weights,
                        [out_type()])
def make_node(self, X, DY):
    X = gpu_contiguous(as_cuda_ndarray_variable(X))
    DY = gpu_contiguous(as_cuda_ndarray_variable(DY))
    assert X.dtype == "float32"
    assert DY.dtype == "float32"
    assert X.ndim == 4
    assert DY.ndim == 4
    return theano.Apply(self, [X, DY], [X.type()])
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
    one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
    o = basic_ops.as_cuda_ndarray_variable(o)
    x = basic_ops.as_cuda_ndarray_variable(x)
    y = basic_ops.as_cuda_ndarray_variable(y)
    if alpha is None:
        alpha = one
    return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])
def make_node(self, o, x, y, xIdx, yIdx, alpha=None): one = tensor.constant(numpy.asarray(1.0, dtype="float32")) o = basic_ops.as_cuda_ndarray_variable(o) x = basic_ops.as_cuda_ndarray_variable(x) y = basic_ops.as_cuda_ndarray_variable(y) if alpha is None: alpha = one return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])
def make_node(self, X, sizes):
    X = gpu_contiguous(as_cuda_ndarray_variable(X))
    sizes = gpu_contiguous(as_cuda_ndarray_variable(sizes))
    assert X.dtype == "float32"
    assert X.ndim == 4
    assert sizes.dtype == "float32"
    assert sizes.ndim == 2
    return theano.Apply(self, [X, sizes], [X.type()])
def make_node(self, x, b, y_idx):
    # N.B. won't work when we don't cast y_idx to float anymore
    x = as_cuda_ndarray_variable(x)
    b = as_cuda_ndarray_variable(b)
    y_idx = as_cuda_ndarray_variable(y_idx)
    nll = y_idx.type()
    sm = x.type()
    am = y_idx.type()
    return Apply(self, [x, b, y_idx], [nll, sm, am])
def make_node(self, inp1, inp2):
    inp1 = as_cuda_ndarray_variable(inp1)
    inp2 = as_cuda_ndarray_variable(inp2)
    assert inp1.ndim == 2
    assert inp2.ndim == 2
    return theano.Apply(
        self, [inp1, inp2],
        [CudaNdarrayType(broadcastable=[False] * inp1.type.ndim)()])
def h_softmax_gpu(W1, b1, W2, b2, x, n_outputs, n_classes,
                  n_outputs_per_class, batch_size, target=None):
    """
    GPU-only version of a two-layer hierarchical softmax.

    See hierarchical_softmax's docstring for the description of the
    arguments.
    """
    W1 = as_cuda_ndarray_variable(W1)
    b1 = as_cuda_ndarray_variable(b1)
    W2 = as_cuda_ndarray_variable(W2)
    b2 = as_cuda_ndarray_variable(b2)
    x = as_cuda_ndarray_variable(x)

    # First softmax, which computes the probability of belonging to each class
    class_probs = tensor.nnet.softmax(tensor.dot(x, W1) + b1)

    if target is None:
        # Computes the probabilities of all the outputs
        class_ids = tensor.tile(
            tensor.arange(n_classes, dtype="int32")[None, :],
            (batch_size, 1))

        # Second softmax, which computes the output probabilities
        activations = sparse_block_dot_SS(
            W2[None, :, :, :], x[:, None, :],
            tensor.zeros((batch_size, 1), dtype='int32'), b2, class_ids)

        output_probs = tensor.nnet.softmax(
            activations.reshape((-1, n_outputs_per_class)))
        output_probs = output_probs.reshape((batch_size, n_classes, -1))
        output_probs = class_probs[:, :, None] * output_probs
        output_probs = output_probs.reshape((batch_size, -1))
        output_probs = output_probs[:, :n_outputs]
    else:
        # Computes only the probabilities of the outputs specified by the
        # targets
        target = target.flatten()

        # Class to which each target belongs
        target_classes = target // n_outputs_per_class

        # Index of each target inside its class (the modulus must be
        # n_outputs_per_class, not n_classes)
        target_outputs_in_class = target % n_outputs_per_class

        # Second softmax, which computes the output probabilities
        activations = sparse_block_dot_SS(
            W2[None, :, :, :], x[:, None, :],
            tensor.zeros((batch_size, 1), dtype='int32'), b2,
            target_classes[:, None])

        output_probs = tensor.nnet.softmax(activations[:, 0, :])
        target_class_probs = class_probs[tensor.arange(batch_size),
                                         target_classes]
        output_probs = output_probs[tensor.arange(batch_size),
                                    target_outputs_in_class]
        output_probs = target_class_probs * output_probs

    return output_probs
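# Usage sketch (hypothetical, not part of the original code): a minimal
# example of calling h_softmax_gpu, assuming the parameter shapes follow the
# convention of Theano's CPU h_softmax, i.e. W1: (n_in, n_classes),
# b1: (n_classes,), W2: (n_classes, n_in, n_outputs_per_class),
# b2: (n_classes, n_outputs_per_class). All names and sizes are illustrative.
def _example_h_softmax_gpu_usage():
    import numpy
    import theano
    import theano.tensor as tensor

    batch_size, n_in, n_classes, n_outputs_per_class = 32, 100, 50, 50
    n_outputs = n_classes * n_outputs_per_class

    W1 = theano.shared(numpy.zeros((n_in, n_classes), dtype='float32'))
    b1 = theano.shared(numpy.zeros(n_classes, dtype='float32'))
    W2 = theano.shared(numpy.zeros((n_classes, n_in, n_outputs_per_class),
                                   dtype='float32'))
    b2 = theano.shared(numpy.zeros((n_classes, n_outputs_per_class),
                                   dtype='float32'))

    x = tensor.matrix('x')   # (batch_size, n_in) input features
    y = tensor.ivector('y')  # (batch_size,) target indices in [0, n_outputs)

    # Probability of each example's target output under the two-level softmax
    p_target = h_softmax_gpu(W1, b1, W2, b2, x, n_outputs, n_classes,
                             n_outputs_per_class, batch_size, target=y)
    return theano.function([x, y], p_target)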
def local_gpu_conv3d(node):
    if isinstance(node.op, Conv3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [host_from_gpu(gpu_convd(
                    as_cuda_ndarray_variable(V),
                    as_cuda_ndarray_variable(W),
                    as_cuda_ndarray_variable(b),
                    d))]
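# Registration sketch (an assumption, not from the original file): a node
# rewrite like local_gpu_conv3d is typically wrapped as a Theano local
# optimizer and registered in the CUDA optimizer database so it runs during
# function compilation. Exact import paths and tags vary between Theano
# versions, so treat the lines below as a hedged illustration only.
#
# from theano.gof.opt import local_optimizer
# from theano.sandbox.cuda.opt import gpu_optimizer
#
# local_gpu_conv3d = local_optimizer([Conv3D])(local_gpu_conv3d)
# gpu_optimizer.register("local_gpu_conv3d", local_gpu_conv3d,
#                        'fast_run', 'gpu')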
def make_node(self, inp1, inp2):
    inp1 = basic_ops.gpu_contiguous(
        basic_ops.as_cuda_ndarray_variable(inp1))
    inp2 = basic_ops.gpu_contiguous(
        basic_ops.as_cuda_ndarray_variable(inp2))
    assert inp1.dtype == "float32"
    assert inp2.dtype == "float32"
    assert inp1.ndim == 4  # (batch, a, b, real/imag)
    assert inp2.ndim == 4
    return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_node(self, X, W, b):
    X = gpu_contiguous(as_cuda_ndarray_variable(X))
    W = gpu_contiguous(as_cuda_ndarray_variable(W))
    b = gpu_contiguous(as_cuda_ndarray_variable(b))
    assert X.dtype == "float32"
    assert W.dtype == "float32"
    assert b.dtype == "float32"
    assert X.ndim == 4
    assert W.ndim == 4
    assert b.ndim == 1
    return theano.Apply(self, [X, W, b], [X.type()])
def local_gpu_conv_grad3d(node):
    if isinstance(node.op, ConvGrad3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d, WShape,
                    as_cuda_ndarray_variable(dCdH)))]
def make_node(self, img, kern):
    img = as_cuda_ndarray_variable(img)
    kern = as_cuda_ndarray_variable(kern)
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                     False, False]
    return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def make_node(self, V, U, UinvT, Q, H, Y_indexes, Y_values, learning_rate,
              use_qtilde=0, use_lower=1, invup_mode=1, stabilize_period=10,
              unfactorize_period=100, debug_print=0):
    # The following are supposed to reside on the GPU
    V = as_cuda_ndarray_variable(V)
    U = as_cuda_ndarray_variable(U)
    UinvT = as_cuda_ndarray_variable(UinvT)
    Q = as_cuda_ndarray_variable(Q)
    H = as_cuda_ndarray_variable(H)

    # The following are on the CPU
    Y_indexes = as_tensor_variable(Y_indexes)
    Y_values = as_tensor_variable(Y_values)
    learning_rate = as_tensor_variable(learning_rate)
    use_qtilde = as_tensor_variable(use_qtilde)
    use_lower = as_tensor_variable(use_lower)
    invup_mode = as_tensor_variable(invup_mode)
    stabilize_period = as_tensor_variable(stabilize_period)
    unfactorize_period = as_tensor_variable(unfactorize_period)
    debug_print = as_tensor_variable(debug_print)

    params = [V, U, UinvT, Q, H, Y_indexes, Y_values, learning_rate,
              use_qtilde, use_lower, invup_mode, stabilize_period,
              unfactorize_period, debug_print]

    # Make sure the parameters are either all of dtype float32 or all of
    # dtype float64 (except for Y_indexes, which are integers).
    elem_type = V.dtype
    if elem_type != "float32" and elem_type != "float64":
        raise TypeError("LargeSparseTargets parameter V must have dtype of "
                        "float32 or float64")

    check_tensor_variables_ndim_and_dtype(0, elem_type, ["learning_rate"],
                                          locals())
    check_tensor_variables_ndim_and_dtype(2, elem_type,
                                          ["V", "U", "UinvT", "Q", "H",
                                           "Y_values"], locals())
    check_tensor_variables_ndim_and_dtype(2, "int32", ["Y_indexes"],
                                          locals())

    # Now properly set up the outputs to compute
    if self.what_to_output == 0:    # output scalar cost
        outputs = [T.scalar(elem_type)]
    elif self.what_to_output == 1:  # output grad_H
        outputs = [CudaNdarrayType(broadcastable=(False, False))()]
    elif self.what_to_output == 2:  # output cost and grad_H
        outputs = [T.scalar(elem_type),
                   CudaNdarrayType(broadcastable=(False, False))()]
    else:
        raise ValueError("Invalid value for what_to_output: "
                         "must be 0, 1, or 2")

    return Apply(self, params, outputs)
def make_node(self, W, b, d, H, RShape=None):
    W_ = as_cuda_ndarray_variable(W)
    b_ = as_cuda_ndarray_variable(b)
    d_ = T.as_tensor_variable(d)
    H_ = as_cuda_ndarray_variable(H)
    if RShape:
        RShape_ = T.as_tensor_variable(RShape)
    else:
        RShape_ = T.as_tensor_variable([-1, -1, -1])
    return theano.Apply(
        self, inputs=[W_, b_, d_, H_, RShape_],
        outputs=[CudaNdarrayType(dtype=H_.dtype,
                                 broadcastable=(False,) * 5)()])
def make_node(self, V, d, WShape, dCdH):
    """
    :param V: visible
    :param d: strides
    :param WShape: shape of the weights -> shape of this op's output
    :param dCdH: the other input with which V will be convolved.
    """
    V_ = as_cuda_ndarray_variable(V)
    d_ = T.as_tensor_variable(d)
    WShape_ = T.as_tensor_variable(WShape)
    dCdH_ = as_cuda_ndarray_variable(dCdH)

    return theano.Apply(
        self, inputs=[V_, d_, WShape_, dCdH_],
        outputs=[CudaNdarrayType(dtype=V_.dtype,
                                 broadcastable=(False,) * 5)()])
def make_node(self, o, W, h, inputIdx, outputIdx):
    o = basic_ops.as_cuda_ndarray_variable(o)
    W = basic_ops.as_cuda_ndarray_variable(W)
    h = basic_ops.as_cuda_ndarray_variable(h)
    assert o.ndim == 3
    assert W.ndim == 4
    assert h.ndim == 3
    assert inputIdx.ndim == 2
    assert outputIdx.ndim == 2
    assert inputIdx.type.dtype in discrete_dtypes
    assert outputIdx.type.dtype in discrete_dtypes
    return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])
def make_node(self, X, regions_y, regions_x, out_size):
    X = gpu_contiguous(as_cuda_ndarray_variable(X))
    assert X.dtype == "float32"
    assert X.ndim == 4
    regions_y = gpu_contiguous(as_cuda_ndarray_variable(regions_y))
    assert regions_y.dtype == "float32"
    assert regions_y.ndim == 2
    regions_x = gpu_contiguous(as_cuda_ndarray_variable(regions_x))
    assert regions_x.dtype == "float32"
    assert regions_x.ndim == 2, regions_x.ndim
    out_size = T.as_tensor_variable(out_size)
    assert out_size.dtype == "float32"
    assert out_size.ndim == 1
    return theano.Apply(self, [X, regions_y, regions_x, out_size],
                        [X.type()])
def make_node(self, V, W, b, d): """ :param V: Visible unit, input :param W: Weights, filter :param b: bias :param d: strides when moving the filter over the input """ V_ = as_cuda_ndarray_variable(V) W_ = as_cuda_ndarray_variable(W) b_ = as_cuda_ndarray_variable(b) d_ = T.as_tensor_variable(d) return theano.Apply(self, inputs=[V_, W_, b_, d_], outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
def make_node(self, X, DY, regions_y, regions_x):
    X = gpu_contiguous(as_cuda_ndarray_variable(X))
    assert X.dtype == "float32"
    assert X.ndim == 4
    DY = gpu_contiguous(as_cuda_ndarray_variable(DY))
    assert DY.dtype == "float32"
    assert DY.ndim == 4
    regions_y = gpu_contiguous(as_cuda_ndarray_variable(regions_y))
    assert regions_y.dtype == "float32"
    assert regions_y.ndim == 2
    regions_x = gpu_contiguous(as_cuda_ndarray_variable(regions_x))
    assert regions_x.dtype == "float32"
    assert regions_x.ndim == 2, regions_x.ndim
    return theano.Apply(self, [X, DY, regions_y, regions_x], [X.type()])
def make_node(self, Z, V_h, c, i):
    Z = gpu_contiguous(as_cuda_ndarray_variable(Z))
    V_h = gpu_contiguous(as_cuda_ndarray_variable(V_h))
    c = gpu_contiguous(as_cuda_ndarray_variable(c))
    i = gpu_contiguous(as_cuda_ndarray_variable(i))
    assert Z.dtype == "float32"
    assert V_h.dtype == "float32"
    assert c.dtype == 'float32'
    assert c.ndim == 2
    assert Z.ndim == 2
    assert i.ndim == 1
    assert V_h.ndim == 2

    # results: output Y, (gates and cell state) H
    return theano.Apply(self, [Z, V_h, c, i],
                        [Z.type(), Z.type(), c.type()])
def make_node(self, V_f, V_b, c_f, c_b, idx_f, idx_b, Dd_f, Dd_b,
              DY_f, DY_b, Y_f, Y_b, H_f, H_b):
    V_f = gpu_contiguous(as_cuda_ndarray_variable(V_f))
    V_b = gpu_contiguous(as_cuda_ndarray_variable(V_b))
    c_f = gpu_contiguous(as_cuda_ndarray_variable(c_f))
    c_b = gpu_contiguous(as_cuda_ndarray_variable(c_b))
    DY_f = gpu_contiguous(as_cuda_ndarray_variable(DY_f))
    DY_b = gpu_contiguous(as_cuda_ndarray_variable(DY_b))
    idx_f = gpu_contiguous(
        as_cuda_ndarray_variable(T.cast(idx_f, 'float32')))
    idx_b = gpu_contiguous(
        as_cuda_ndarray_variable(T.cast(idx_b, 'float32')))
    Dd_f = gpu_contiguous(as_cuda_ndarray_variable(Dd_f))
    Dd_b = gpu_contiguous(as_cuda_ndarray_variable(Dd_b))
    assert V_f.dtype == "float32"
    assert V_b.dtype == "float32"
    assert DY_f.dtype == 'float32'
    assert DY_b.dtype == 'float32'
    assert Y_f.dtype == 'float32'
    assert Y_b.dtype == 'float32'
    assert H_f.dtype == 'float32'
    assert H_b.dtype == 'float32'
    assert c_f.dtype == 'float32'
    assert c_b.dtype == 'float32'
    assert V_f.ndim == 2
    assert V_b.ndim == 2
    assert DY_f.ndim == 3
    assert DY_b.ndim == 3
    assert Y_f.ndim == 3
    assert Y_b.ndim == 3
    assert H_f.ndim == 3
    assert H_b.ndim == 3
    assert c_f.ndim == 2
    assert c_b.ndim == 2
    assert idx_f.ndim == 2
    assert idx_b.ndim == 2

    return theano.Apply(
        self,
        [V_f, V_b, c_f, c_b, idx_f, idx_b, Dd_f, Dd_b, DY_f, DY_b,
         Y_f, Y_b, H_f, H_b],
        [H_f.type(), H_b.type(), V_f.type(), V_b.type(),
         c_f.type(), c_b.type()])
def grad(self, inputs, dout):
    images, = inputs
    acts, denoms = self(images)
    dout, _ = dout  # Ignore the gradient on "denoms"
    dout = as_cuda_ndarray_variable(dout)
    grad_op = CrossMapNormUndo(self._size_f, self._add_scale,
                               self._pow_scale, self._blocked,
                               inplace=False)
    return [grad_op(images, acts, denoms, dout)[0]]
def make_node(self, inp):
    inp = basic_ops.gpu_contiguous(
        basic_ops.as_cuda_ndarray_variable(inp))
    assert inp.dtype == "float32"
    return theano.Apply(self, [inp], [self.output_type(inp)()])
def make_node(self, X, W1, W2, W3, W4, V_h1, V_h2, V_h3, V_h4, V_v1, V_v2, V_v3, V_v4, b1, b2, b3, b4, sizes): var_names = ["X", "W1", "W2", "W3", "W4", "V_h1", "V_h2", "V_h3", "V_h4", "V_v1", "V_v2", "V_v3", "V_v4", "b1", "b2", "b3", "b4"] lcl = locals() for var_name in var_names: lcl[var_name] = gpu_contiguous(as_cuda_ndarray_variable(lcl[var_name])) assert lcl[var_name].dtype == "float32" #note: sizes lives on the CPU! sizes = T.as_tensor_variable(sizes) assert sizes.dtype == "float32" assert lcl["X"].ndim == 4 assert lcl["W1"].ndim == 2 assert lcl["W2"].ndim == 2 assert lcl["W3"].ndim == 2 assert lcl["W4"].ndim == 2 assert lcl["V_h1"].ndim == 2 assert lcl["V_h2"].ndim == 2 assert lcl["V_h3"].ndim == 2 assert lcl["V_h4"].ndim == 2 assert lcl["V_v1"].ndim == 2 assert lcl["V_v2"].ndim == 2 assert lcl["V_v3"].ndim == 2 assert lcl["V_v4"].ndim == 2 assert lcl["b1"].ndim == 1 assert lcl["b2"].ndim == 1 assert lcl["b3"].ndim == 1 assert lcl["b4"].ndim == 1 assert sizes.ndim == 2 all_vars = [lcl[var_name] for var_name in var_names] + [sizes] #results: outputs Y1, Y2, Y3, Y4, (gates and cell states) H1, H2, H3, H4 return theano.Apply(self, all_vars, [lcl["X"].type() for _ in xrange(8)])
def make_node(self, X, W1, W2, W3, W4, V_h1, V_h2, V_h3, V_h4,
              V_v1, V_v2, V_v3, V_v4, b1, b2, b3, b4, sizes,
              DY1, DY2, DY3, DY4, Y1, Y2, Y3, Y4, H1, H2, H3, H4):
    var_names = ["X", "W1", "W2", "W3", "W4",
                 "V_h1", "V_h2", "V_h3", "V_h4",
                 "V_v1", "V_v2", "V_v3", "V_v4",
                 "b1", "b2", "b3", "b4",
                 "DY1", "DY2", "DY3", "DY4",
                 "Y1", "Y2", "Y3", "Y4",
                 "H1", "H2", "H3", "H4"]
    lcl = locals()
    for var_name in var_names:
        lcl[var_name] = gpu_contiguous(
            as_cuda_ndarray_variable(lcl[var_name]))
        assert lcl[var_name].dtype == "float32"
    # note: sizes lives on the CPU!
    sizes = T.as_tensor_variable(sizes)
    assert sizes.dtype == "float32"

    expected_ndims = [4] + ([2] * 12) + ([1] * 4) + ([4] * 12)
    assert len(var_names) == len(expected_ndims), (len(var_names),
                                                   len(expected_ndims))
    for var_name, expected_ndim in zip(var_names, expected_ndims):
        assert lcl[var_name].ndim == expected_ndim, \
            (var_name, lcl[var_name].name, lcl[var_name].ndim, expected_ndim)
    assert sizes.ndim == 2

    all_vars_no_sizes = [lcl[var_name] for var_name in var_names]
    all_vars = all_vars_no_sizes[:17] + [sizes] + all_vars_no_sizes[17:]
    inputs_vars = all_vars[:17]

    return theano.Apply(self, all_vars, [v.type() for v in inputs_vars])
def make_node(self, p, h, gp, gh):
    p = as_cuda_ndarray_variable(p)
    h = as_cuda_ndarray_variable(h)
    gp = as_cuda_ndarray_variable(gp)
    gh = as_cuda_ndarray_variable(gh)
    assert p.ndim == 4
    assert h.ndim == 4
    assert gp.ndim == 4
    assert gh.ndim == 4
    try:
        nb_channel = int(get_scalar_constant_value(h.shape[0]))
        assert nb_channel % 16 == 0
    except NotScalarConstantError:
        pass
    return Apply(self, [p, h, gp, gh], [p.type(), h.type()])
def make_node(self, *inputs):
    _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
    for i in _inputs[1:]:
        if i.type.ndim != inputs[0].type.ndim:
            raise TypeError('different ranks among inputs')
    if any([any(i.type.broadcastable) for i in inputs]):
        raise Exception("pycuda doesn't support broadcastable dimensions")
    assert len(inputs) == 2  # TODO remove

    otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
    assert self.nout == 1
    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
    in_name = ["i" + str(id) for id in range(len(inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]
    c_code = self.scalar_op.c_code(out_node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [var.type.dtype_specs()[1] + " *" + name
         for var, name in zip(inputs, in_name) + zip(out_node.outputs,
                                                     out_name)]
        + ["int size"])
    mod = SourceModule("""
#include <Python.h>
#include <numpy/arrayobject.h>

__global__ void %s(%s)
{
    int i = (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y);
    i += threadIdx.x + threadIdx.y * blockDim.x;
    if (i < size) {
        %s
    }
}
""" % (fct_name, c_code_param, c_code))
    self.pycuda_fct = mod.get_function(fct_name)
    return out_node
def make_node(self, images, maxout, gz):
    images = as_cuda_ndarray_variable(images)
    maxout = as_cuda_ndarray_variable(maxout)
    gz = as_cuda_ndarray_variable(gz)
    assert images.ndim == 4
    assert maxout.ndim == 4
    assert gz.ndim == 4
    try:
        # Note: `get_scalar_constant_value` returns an ndarray, not an int
        nb_channel = int(get_scalar_constant_value(images.shape[0]))
        assert nb_channel % 16 == 0
    except NotScalarConstantError:
        pass
    return Apply(self, [images, maxout, gz], [images.type()])
def make_node(self, X, W1, W2, V_h1, V_h2, V_v1, V_v2, b1, b2, sizes): var_names = [ "X", "W1", "W2", "V_h1", "V_h2", "V_v1", "V_v2", "b1", "b2" ] lcl = locals() for var_name in var_names: lcl[var_name] = gpu_contiguous( as_cuda_ndarray_variable(lcl[var_name])) assert lcl[var_name].dtype == "float32" #note: sizes lives on the CPU! sizes = T.as_tensor_variable(sizes) assert sizes.dtype == "float32" assert lcl["X"].ndim == 4 assert lcl["W1"].ndim == 2 assert lcl["W2"].ndim == 2 assert lcl["V_h1"].ndim == 2 assert lcl["V_h2"].ndim == 2 assert lcl["V_v1"].ndim == 2 assert lcl["V_v2"].ndim == 2 assert lcl["b1"].ndim == 1 assert lcl["b2"].ndim == 1 assert sizes.ndim == 2 all_vars = [lcl[var_name] for var_name in var_names] + [sizes] #results: outputs Y1, Y2, Y3, Y4, (gates and cell states) H1, H2, H3, H4 return theano.Apply(self, all_vars, [lcl["X"].type() for _ in range(4)])
def make_node(self, img, kern, desc):
    img = as_cuda_ndarray_variable(img)
    kern = as_cuda_ndarray_variable(kern)
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    if not isinstance(desc.type, CDataType) \
            or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')
    broadcastable = (img.type.broadcastable[0], kern.type.broadcastable[0],
                     False, False)
    return Apply(self, [img, kern, desc],
                 [CudaNdarrayType(broadcastable)()])
def make_node(self, V, W, b, d): """ :param V: Visible unit, input :param W: Weights, filter :param b: bias :param d: strides when moving the filter over the input """ V_ = as_cuda_ndarray_variable(V) W_ = as_cuda_ndarray_variable(W) b_ = as_cuda_ndarray_variable(b) d_ = T.as_tensor_variable(d) broad = (V_.broadcastable[0], W_.broadcastable[0], False, False, False) return theano.Apply( self, inputs=[V_, W_, b_, d_], outputs=[CudaNdarrayType(dtype=V_.dtype, broadcastable=broad)()])
def grad(self, inputs, g_outputs):
    """
    .. todo::

        WRITEME
    """
    hid_acts, filters, output_shape = inputs
    g_images, = g_outputs
    g_images = as_cuda_ndarray_variable(g_images)
    assert not isinstance(g_images, list)

    global FilterActs
    global WeightActs
    if FilterActs is None:
        from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
        from pylearn2.sandbox.cuda_convnet.weight_acts import WeightActs

    g_filters = WeightActs(stride=self.stride,
                           partial_sum=self.partial_sum,
                           pad=self.pad)(g_images, hid_acts,
                                         filters.shape[1:3])[0]
    assert not isinstance(g_filters, list)
    g_hid_acts = FilterActs(stride=self.stride, pad=self.pad,
                            partial_sum=self.partial_sum)(g_images, filters)

    return [g_hid_acts, g_filters, DisconnectedType()()]
def local_softmax_dnn(node):
    raise_no_dnn()
    if isinstance(node.op, GpuSoftmax):
        ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
        out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(
            gpu_contiguous(ins))
        out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
        return [out]
def make_node(self, X, W1, W2, V_h1, V_h2, V_v1, V_v2, b1, b2, sizes,
              DY1, DY2, Y1, Y2, H1, H2):
    var_names = ["X", "W1", "W2", "V_h1", "V_h2", "V_v1", "V_v2",
                 "b1", "b2", "DY1", "DY2", "Y1", "Y2", "H1", "H2"]
    lcl = locals()
    for var_name in var_names:
        lcl[var_name] = gpu_contiguous(
            as_cuda_ndarray_variable(lcl[var_name]))
        assert lcl[var_name].dtype == "float32"
    # note: sizes lives on the CPU!
    sizes = T.as_tensor_variable(sizes)
    assert sizes.dtype == "float32"

    expected_ndims = [4] + ([2] * 6) + ([1] * 2) + ([4] * 6)
    assert len(var_names) == len(expected_ndims), (len(var_names),
                                                   len(expected_ndims))
    for var_name, expected_ndim in zip(var_names, expected_ndims):
        assert lcl[var_name].ndim == expected_ndim, \
            (var_name, lcl[var_name].name, lcl[var_name].ndim, expected_ndim)
    assert sizes.ndim == 2

    all_vars_no_sizes = [lcl[var_name] for var_name in var_names]
    all_vars = all_vars_no_sizes[:9] + [sizes] + all_vars_no_sizes[9:]
    inputs_vars = all_vars[:9]

    return theano.Apply(self, all_vars, [v.type() for v in inputs_vars])
def make_node(self, kern, topgrad, desc):
    kern = as_cuda_ndarray_variable(kern)
    topgrad = as_cuda_ndarray_variable(topgrad)
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    if topgrad.type.ndim != 4:
        raise TypeError('topgrad must be 4D tensor')
    if not isinstance(desc.type, CDataType) \
            or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
        raise TypeError('desc must be cudnnConvolutionDescriptor_t')
    broadcastable = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1],
                     False, False]
    return Apply(self, [kern, topgrad, desc],
                 [CudaNdarrayType(broadcastable)()])