def local_large_sparse_targets_gpu(node):
    """Swap a ``LargeSparseTargets`` node for ``GpuLargeSparseTargets``.

    Returns ``False`` when ``node.op`` is not a ``LargeSparseTargets`` or when
    ``theano.config.device`` is ``"cpu"``.  Otherwise returns the replacement
    outputs; which of them are wrapped in ``host_from_gpu`` depends on
    ``node.op.what_to_output``:

    * ``0``  -- the GPU op result is returned as is;
    * ``1``  -- the GPU op result is wrapped in ``host_from_gpu``;
    * other -- two outputs, of which only the second is wrapped.
    """
    if not (isinstance(node.op, LargeSparseTargets)
            and theano.config.device != "cpu"):
        return False

    what = node.op.what_to_output
    # Every branch applies the same GPU op to the same inputs exactly once.
    gpu_result = GpuLargeSparseTargets(what)(*node.inputs)

    if what == 0:
        return [gpu_result]
    if what == 1:
        return [host_from_gpu(gpu_result)]
    return [gpu_result[0], host_from_gpu(gpu_result[1])]
def local_large_sparse_targets_gpu(node):
    """Swap a ``LargeSparseTargets`` node for ``GpuLargeSparseTargets``.

    Returns ``False`` when ``node.op`` is not a ``LargeSparseTargets`` or when
    ``theano.config.device`` is ``"cpu"``.  Otherwise returns the replacement
    outputs; which of them are wrapped in ``host_from_gpu`` depends on
    ``node.op.what_to_output``:

    * ``0``  -- the GPU op result is returned as is;
    * ``1``  -- the GPU op result is wrapped in ``host_from_gpu``;
    * other -- two outputs, of which only the second is wrapped.
    """
    if not (isinstance(node.op, LargeSparseTargets)
            and theano.config.device != "cpu"):
        return False

    what = node.op.what_to_output
    # Every branch applies the same GPU op to the same inputs exactly once.
    gpu_result = GpuLargeSparseTargets(what)(*node.inputs)

    if what == 0:
        return [gpu_result]
    if what == 1:
        return [host_from_gpu(gpu_result)]
    return [gpu_result[0], host_from_gpu(gpu_result[1])]
def use_gpu_images2neibs(node):
    """Rewrite a host ``Images2Neibs`` node to go through ``gpu_images2neibs``.

    The first input is transferred with ``gpu_from_host``; the other two
    inputs and the op's ``mode`` are forwarded unchanged.  The result is
    wrapped in ``host_from_gpu``.

    Returns a one-element replacement list, or ``None`` when the node is left
    alone (different op type, or a first input that is not float32).
    """
    if type(node.op) is not Images2Neibs:
        return None

    images = node.inputs[0]
    # The old CUDA backend only holds float32 data; without this guard the
    # transfer below raises for any other dtype instead of skipping the node.
    if images.dtype != 'float32':
        return None

    gpu_out = gpu_images2neibs(gpu_from_host(images),
                               node.inputs[1],
                               node.inputs[2],
                               mode=node.op.mode)
    return [host_from_gpu(gpu_out)]
def local_gpu_minres(node):
    """Feed a ``MinresQLP`` node GPU inputs when one input comes from the GPU.

    Returns ``None`` for nodes whose op is not a ``MinresQLP`` and ``False``
    when no input is produced by ``host_from_gpu``.  Otherwise every input
    that is not already a ``CudaNdarrayType`` is passed through
    ``gpu_from_host``, the op is re-applied, and each ``CudaNdarrayType``
    output is wrapped in ``host_from_gpu``.
    """
    if not isinstance(node.op, MinresQLP):
        return None

    fed_from_gpu = any(inp.owner and inp.owner.op == host_from_gpu
                       for inp in node.inputs)
    if not fed_from_gpu:
        return False

    gpu_inputs = [
        inp if isinstance(inp.type, CudaNdarrayType) else gpu_from_host(inp)
        for inp in node.inputs
    ]

    # NOTE(review): the flag is set on ``node.op`` itself (no copy is made),
    # so the original op instance is modified as well.
    node.op.gpu = 1
    raw_outputs = node.op(*gpu_inputs)

    return [
        host_from_gpu(out) if isinstance(out.type, CudaNdarrayType) else out
        for out in raw_outputs
    ]
def use_gpu_images2neibs(node):
    """Rewrite a float32 ``Images2Neibs`` node to use ``gpu_images2neibs``.

    Only nodes whose op type is exactly ``Images2Neibs``, whose first input
    is float32 and whose mode is ``'valid'`` or ``'wrap_centered'`` are
    rewritten; any other node yields ``None``.
    """
    if type(node.op) is not Images2Neibs:
        return None

    images = node.inputs[0]
    if images.dtype != 'float32':
        return None
    if node.op.mode not in ('valid', 'wrap_centered'):
        return None

    gpu_out = gpu_images2neibs(gpu_from_host(images),
                               node.inputs[1],
                               node.inputs[2],
                               mode=node.op.mode)
    return [host_from_gpu(gpu_out)]
def local_gpu_multinomial(node):
    """Replace ``MultinomialFromUniform`` with ``GpuMultinomialFromUniform``.

    Two patterns are handled:

    * a ``MultinomialFromUniform`` node with float32 inputs/output where at
      least one input is produced by ``HostFromGpu``;
    * a ``GpuFromHost`` node whose input is produced by a float32
      ``MultinomialFromUniform`` node.

    Returns a one-element replacement list, or ``None`` when neither pattern
    applies.
    """
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        all_float32 = p.dtype == u.dtype == m.dtype == 'float32'
        if all_float32 and any(
                i.owner and isinstance(i.owner.op,
                                       theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            gpu_inputs = [gpu_from_host(i) for i in node.inputs]
            return [host_from_gpu(gpu_op(*gpu_inputs)).T]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
            and node.inputs[0].owner
            and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if p.dtype == u.dtype == m.dtype == 'float32':
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            gpu_inputs = [gpu_from_host(i) for i in multi.inputs]
            transposed = gpu_op(*gpu_inputs).T
            # The dimshuffle is on the cpu, but will be moved to the gpu by
            # an opt.
            return [gpu_from_host(transposed)]
def use_gpu_cumsum(node):
    """Rewrite a float32 ``CumOp`` in ``'add'`` mode to use ``GpuCumsum``.

    The node is only rewritten when its input is float32 and is produced by
    a ``HostFromGpu`` op.  With an explicit axis, inputs with more than
    ``GpuCumsum.SUPPORTED_NDIMS`` dimensions are skipped.  Returns a
    one-element replacement list, or ``None`` when the node is left alone.
    """
    if type(node.op) is not CumOp:
        return None

    x = node.inputs[0]
    if x.dtype != 'float32':
        return None
    if not (x.owner and isinstance(x.owner.op, HostFromGpu)):
        return None
    if node.op.mode != 'add':
        return None

    axis = node.op.axis
    if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
        return None

    x = gpu_from_host(x)
    if axis is None:
        # ``gpu_cumsum`` assumes the array has been flattened if needed.
        if x.ndim > 1:
            x = gpu_flatten(x)
        axis = 0

    result = host_from_gpu(GpuCumsum(axis)(x))
    result.tag.values_eq_approx = values_eq_approx_high_tol
    return [result]
def use_gpu_images2neibs(node):
    """Rewrite a float32 ``Images2Neibs`` node to use ``gpu_images2neibs``.

    Only nodes whose op type is exactly ``Images2Neibs``, whose first input
    is float32 and whose mode is ``'valid'``, ``'ignore_borders'`` or
    ``'wrap_centered'`` are rewritten; any other node yields ``None``.
    """
    if type(node.op) is not Images2Neibs:
        return None

    images = node.inputs[0]
    if images.dtype != 'float32':
        return None
    if node.op.mode not in ('valid', 'ignore_borders', 'wrap_centered'):
        return None

    gpu_out = gpu_images2neibs(gpu_from_host(images),
                               node.inputs[1],
                               node.inputs[2],
                               mode=node.op.mode)
    return [host_from_gpu(gpu_out)]
def local_gpu_advanced_incsubtensor1_scal_floats(node):
    """Rewrite ``AdvancedIncSubtensor1Floats`` to one of its GPU variants.

    The GPU op class is chosen from the ``(x.ndim, y.ndim)`` pair; pairs not
    listed in ``supported_dims`` are not rewritten.  Two patterns are
    handled:

    * ``GpuFromHost(AdvancedIncSubtensor1Floats(...))``;
    * an ``AdvancedIncSubtensor1Floats`` node whose first three inputs are
      float32 and whose ``x`` or ``y`` is produced by ``HostFromGpu``.

    Returns a one-element replacement list, or ``False`` otherwise.
    """
    supported_dims = {
        # x.ndim, y.ndim
        (1, 0): GpuAdvancedIncSubtensor1Floats_scal_dev20,
        (2, 2): GpuAdvancedIncSubtensor1Floats_dev20,
    }

    def build_gpu_op(gpu_cls, host_op):
        """Instantiate ``gpu_cls`` with the flags of the host op."""
        return gpu_cls(inplace=host_op.inplace,
                       set_instead_of_inc=host_op.set_instead_of_inc)

    def lift(var):
        """Return ``(gpu_var, came_from_gpu)`` for a host variable."""
        if var.owner and isinstance(var.owner.op, HostFromGpu):
            gpu_var, = var.owner.inputs
            return gpu_var, True
        return as_cuda_ndarray_variable(var), False

    if isinstance(node.op, GpuFromHost):
        producer = node.inputs[0].owner
        # Should not execute for GpuAdvancedIncSubtensor1
        if producer and \
                producer.op.__class__ is AdvancedIncSubtensor1Floats:
            x, y = producer.inputs[0:2]
            gpu_cls = supported_dims.get((x.ndim, y.ndim))
            if gpu_cls is None:
                return False
            coords = producer.inputs[2:]
            gpu_op = build_gpu_op(gpu_cls, producer.op)
            return [gpu_op(as_cuda_ndarray_variable(x),
                           as_cuda_ndarray_variable(y),
                           *coords)]

    # Should not execute for GpuAdvancedIncSubtensor1
    if (node.op.__class__ is AdvancedIncSubtensor1Floats
            and all(node.inputs[pos].dtype == "float32"
                    for pos in range(3))):
        x, y = node.inputs[0:2]
        gpu_cls = supported_dims.get((x.ndim, y.ndim))
        if gpu_cls is None:
            return False
        coords = node.inputs[2:]

        gpu_x, x_from_gpu = lift(x)
        gpu_y, y_from_gpu = lift(y)
        if x_from_gpu or y_from_gpu:
            gpu_op = build_gpu_op(gpu_cls, node.op)
            return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]

    return False
def use_gpu_images2neibs(node):
    """Rewrite a host ``Images2Neibs`` node to go through ``gpu_images2neibs``.

    The first input is transferred with ``gpu_from_host``; the other two
    inputs and the op's ``mode`` are forwarded unchanged.  The result is
    wrapped in ``host_from_gpu``.

    Returns a one-element replacement list, or ``None`` when the node is left
    alone (different op type, or a first input that is not float32).
    """
    if type(node.op) is not Images2Neibs:
        return None

    images = node.inputs[0]
    # The old CUDA backend only holds float32 data; without this guard the
    # transfer below raises for any other dtype instead of skipping the node.
    if images.dtype != 'float32':
        return None

    gpu_out = gpu_images2neibs(gpu_from_host(images),
                               node.inputs[1],
                               node.inputs[2],
                               mode=node.op.mode)
    return [host_from_gpu(gpu_out)]
def local_gpu_conv3d(node):
    """Rewrite a float32 ``Conv3D`` node to use ``gpu_convd``.

    The node is rewritten only when at least one input is produced by
    ``HostFromGpu`` and every output dtype is float32.  ``V``, ``W`` and
    ``b`` are converted with ``as_cuda_ndarray_variable``; ``d`` is forwarded
    unchanged.  Returns a one-element replacement list, or ``None``.
    """
    if not isinstance(node.op, Conv3D):
        return None

    fed_from_gpu = any(i.owner and isinstance(i.owner.op, HostFromGpu)
                       for i in node.inputs)
    if not fed_from_gpu:
        return None
    if not all(o.type.dtype == 'float32' for o in node.outputs):
        return None

    V, W, b, d = node.inputs
    gpu_out = gpu_convd(as_cuda_ndarray_variable(V),
                        as_cuda_ndarray_variable(W),
                        as_cuda_ndarray_variable(b),
                        d)
    return [host_from_gpu(gpu_out)]
def local_gpu_argmax(node):
    """Replace ``KArgmax`` with ``GpuKArgmax``.

    Two patterns are handled:

    * a ``KArgmax`` node with float32 input/values where the input is
      produced by ``HostFromGpu``: both outputs are replaced, the values by
      ``host_from_gpu(ret_vals)`` and the indices by the host copy cast to
      ``"int32"``;
    * a ``GpuFromHost`` node applied to one output of a float32 ``KArgmax``
      node: the single output is replaced by the matching ``GpuKArgmax``
      output.

    Returns the list of replacements (one per output of ``node``), or
    ``None`` when neither pattern applies.
    """
    if type(node.op) is KArgmax:
        p, = node.inputs
        vals, indx, = node.outputs
        if (p.dtype == vals.dtype == 'float32' and
                any(i.owner and isinstance(i.owner.op,
                                           theano.sandbox.cuda.HostFromGpu)
                    for i in node.inputs)):
            gpu_op = GpuKArgmax(node.op.K)
            ret_vals, ret_indx = gpu_op(gpu_from_host(p))
            return [host_from_gpu(ret_vals),
                    T.cast(host_from_gpu(ret_indx), "int32")]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is KArgmax):
        transferred = node.inputs[0]
        multi = transferred.owner
        p, = multi.inputs
        vals, indx, = multi.outputs
        if p.dtype == vals.dtype == 'float32':
            gpu_op = GpuKArgmax(multi.op.K)
            ret_vals, ret_indx = gpu_op(gpu_from_host(p))
            # ``node`` is a GpuFromHost with exactly one output, so exactly
            # one replacement must be returned.  The GpuKArgmax outputs are
            # already GPU variables (the branch above feeds them to
            # ``host_from_gpu``), so they are returned directly rather than
            # being passed to ``gpu_from_host`` again.
            if transferred is vals:
                return [ret_vals]
            if transferred is indx:
                return [ret_indx]
def save_data(self, filename, data):
    """Write every element of ``data`` to ``filename``, one per line.

    Parameters
    ----------
    filename : str
        Path of the text file to (over)write.
    data : numpy.ndarray or symbolic variable
        A NumPy array is written as is.  Anything else is passed through
        ``host_from_gpu`` and evaluated with ``.eval()`` first.

    Elements are written in the array's flattened order using ``str``,
    separated by newlines, with no trailing newline.
    """
    if not isinstance(data, np.ndarray):
        data = np.asarray(host_from_gpu(data).eval())

    # ``reshape(-1)`` also copes with 0-d arrays, for which multiplying the
    # entries of an empty shape tuple with ``reduce`` would raise.
    flat = data.reshape(-1)
    text = "\n".join(str(value) for value in flat)

    # The context manager closes the file even if the write fails.
    with open(filename, "w") as out_file:
        out_file.write(text)
def use_gpu_images2neibs(node):
    """Rewrite a float32 ``Images2Neibs`` node to use ``gpu_images2neibs``.

    Only nodes whose op type is exactly ``Images2Neibs``, whose first input
    is float32 and whose mode is ``"valid"``, ``"ignore_borders"`` or
    ``"wrap_centered"`` are rewritten; any other node yields ``None``.
    """
    if type(node.op) is not Images2Neibs:
        return None

    images = node.inputs[0]
    if images.dtype != "float32":
        return None
    if node.op.mode not in ("valid", "ignore_borders", "wrap_centered"):
        return None

    gpu_out = gpu_images2neibs(gpu_from_host(images),
                               node.inputs[1],
                               node.inputs[2],
                               mode=node.op.mode)
    return [host_from_gpu(gpu_out)]
def save_weights(weights, filename):
    """Write every element of ``weights`` to ``filename``, one per line.

    Taken from the convnet code.  Deals with a network calculated on a gpu:
    ``weights`` is passed through ``host_from_gpu`` and evaluated with
    ``.eval()`` before being written.

    Elements are written in flattened order using ``str``, separated by
    newlines, with no trailing newline.
    """
    host_values = np.asarray(host_from_gpu(weights).eval())

    # ``reshape(-1)`` flattens any rank, including 0-d values, for which a
    # ``reduce`` over the (empty) shape would raise.
    text = "\n".join(str(value) for value in host_values.reshape(-1))

    # The context manager closes the file even if the write fails.
    with open(filename, "w") as out_file:
        out_file.write(text)
def save_data(self, filename, data, gpu=False):
    """Write every element of ``data`` to ``filename``, one per line.

    Parameters
    ----------
    filename : str
        Path of the text file to (over)write.
    data : array-like or symbolic variable
        With ``gpu=False`` anything ``numpy.asarray`` accepts.  With
        ``gpu=True`` a variable that is passed through ``host_from_gpu`` and
        evaluated with ``.eval()``.
    gpu : bool
        Whether ``data`` must first be brought back with ``host_from_gpu``.

    Elements are written in flattened order using ``str``, separated by
    newlines, with no trailing newline.
    """
    if gpu:
        data = host_from_gpu(data).eval()

    # ``reshape(-1)`` flattens any rank, including 0-d arrays, for which a
    # ``reduce`` over the (empty) shape tuple would raise.
    flat = np.asarray(data).reshape(-1)
    text = "\n".join(str(value) for value in flat)

    # The context manager closes the file even if the write fails.
    with open(filename, "w") as out_file:
        out_file.write(text)
def save_data(self, filename, data, gpu=False):
    """Write every element of ``data`` to ``filename``, one per line.

    Parameters
    ----------
    filename : str
        Path of the text file to (over)write.
    data : array-like or symbolic variable
        With ``gpu=False`` anything ``numpy.asarray`` accepts.  With
        ``gpu=True`` a variable that is passed through ``host_from_gpu`` and
        evaluated with ``.eval()``.
    gpu : bool
        Whether ``data`` must first be brought back with ``host_from_gpu``.

    Elements are written in flattened order using ``str``, separated by
    newlines, with no trailing newline.
    """
    if gpu:
        data = host_from_gpu(data).eval()

    # ``reshape(-1)`` flattens any rank, including 0-d arrays, for which a
    # ``reduce`` over the (empty) shape tuple would raise.
    flat = np.asarray(data).reshape(-1)
    text = "\n".join(str(value) for value in flat)

    # The context manager closes the file even if the write fails.
    with open(filename, "w") as out_file:
        out_file.write(text)
def local_gpu_conv3d(node):
    """Rewrite a float32 ``Conv3D`` node to use ``gpu_convd``.

    The node is rewritten only when at least one input is produced by
    ``HostFromGpu`` and every output dtype is float32.  ``V``, ``W`` and
    ``b`` are converted with ``as_cuda_ndarray_variable``; ``d`` is forwarded
    unchanged.  Returns a one-element replacement list, or ``None``.
    """
    if not isinstance(node.op, Conv3D):
        return None

    fed_from_gpu = any(i.owner and isinstance(i.owner.op, HostFromGpu)
                       for i in node.inputs)
    if not fed_from_gpu:
        return None
    if not all(o.type.dtype == 'float32' for o in node.outputs):
        return None

    V, W, b, d = node.inputs
    gpu_out = gpu_convd(as_cuda_ndarray_variable(V),
                        as_cuda_ndarray_variable(W),
                        as_cuda_ndarray_variable(b),
                        d)
    return [host_from_gpu(gpu_out)]
def grab_cpu_scalar(v, nd):
    """Return the host variable that ``v`` broadcasts to ``nd`` dimensions.

    * ``v`` produced by a ``GpuDimShuffle`` whose ``new_order`` is ``nd``
      times ``'x'``: its input wrapped in ``host_from_gpu``.
    * ``v`` produced by a ``DimShuffle`` with that same order: its input.
    * ``v`` produced by ``GpuFromHost``: the search continues on its input.
    * ``v`` a ``Constant`` broadcastable along all ``nd`` dimensions:
      ``v.dimshuffle(())``.

    In every other case ``None`` is returned.
    """
    producer = v.owner

    if producer is None:
        if isinstance(v, Constant) and v.broadcastable == (True, ) * nd:
            return v.dimshuffle(())
        return None

    def only_broadcasts(op):
        """True when ``op.new_order`` consists of ``nd`` ``'x'`` entries."""
        return op.new_order == ('x', ) * nd

    op = producer.op
    if isinstance(op, GpuDimShuffle) and only_broadcasts(op):
        return host_from_gpu(producer.inputs[0])
    if isinstance(op, DimShuffle) and only_broadcasts(op):
        return producer.inputs[0]
    if isinstance(op, GpuFromHost):
        return grab_cpu_scalar(producer.inputs[0], nd=nd)
    return None
def grab_cpu_scalar(v, nd):
    """Return the host variable that ``v`` broadcasts to ``nd`` dimensions.

    * ``v`` produced by a ``GpuDimShuffle`` whose ``new_order`` is ``nd``
      times ``'x'``: its input wrapped in ``host_from_gpu``.
    * ``v`` produced by a ``DimShuffle`` with that same order: its input.
    * ``v`` produced by ``GpuFromHost``: the search continues on its input.
    * ``v`` a ``Constant`` broadcastable along all ``nd`` dimensions:
      ``v.dimshuffle(())``.

    In every other case ``None`` is returned.
    """
    producer = v.owner

    if producer is None:
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None

    def only_broadcasts(op):
        """True when ``op.new_order`` consists of ``nd`` ``'x'`` entries."""
        return op.new_order == ('x',) * nd

    op = producer.op
    if isinstance(op, GpuDimShuffle) and only_broadcasts(op):
        return host_from_gpu(producer.inputs[0])
    if isinstance(op, DimShuffle) and only_broadcasts(op):
        return producer.inputs[0]
    if isinstance(op, GpuFromHost):
        return grab_cpu_scalar(producer.inputs[0], nd=nd)
    return None
def local_gpu_forloop(node):
    """Feed a ``forloop`` node GPU inputs when one input comes from the GPU.

    Returns ``None`` for nodes whose op is not a ``forloop`` and ``False``
    when no input is produced by ``host_from_gpu``.  Otherwise every input
    that is not already a ``CudaNdarrayType`` is passed through
    ``gpu_from_host``, the op is re-applied, and every output is wrapped in
    ``host_from_gpu``.
    """
    if not isinstance(node.op, forloop):
        return None

    fed_from_gpu = any(inp.owner and inp.owner.op == host_from_gpu
                       for inp in node.inputs)
    if not fed_from_gpu:
        return False

    gpu_inputs = [
        inp if isinstance(inp.type, CudaNdarrayType) else gpu_from_host(inp)
        for inp in node.inputs
    ]
    return [host_from_gpu(out) for out in node.op(*gpu_inputs)]
def local_gpu_multinomial(node):
    """Replace ``MultinomialFromUniform`` with ``GpuMultinomialFromUniform``.

    Only the single-sample case is rewritten: the multinomial node must have
    either two inputs ``(p, u)`` or a third ``n_samples`` input that
    ``get_scalar_constant_value`` resolves to ``1``.

    Two patterns are handled:

    * a ``MultinomialFromUniform`` node with float32 ``p``/``u``/output where
      at least one input is produced by ``HostFromGpu``;
    * a ``GpuFromHost`` node whose input is produced by a float32
      ``MultinomialFromUniform`` node.

    Returns a one-element replacement list, or ``None`` when the node is
    left alone.
    """
    def single_sample_inputs(inputs):
        """Return ``(p, u)`` if exactly one sample is requested, else None."""
        if len(inputs) == 2:
            p, u = inputs
            return p, u
        p, u, n_samples = inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        return p, u

    if type(node.op) is MultinomialFromUniform:
        p_and_u = single_sample_inputs(node.inputs)
        if p_and_u is None:
            return None
        p, u = p_and_u
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
                any(i.owner and isinstance(i.owner.op,
                                           theano.sandbox.cuda.HostFromGpu)
                    for i in node.inputs)):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            gpu_inputs = [gpu_from_host(i) for i in (p, u)]
            return [host_from_gpu(gpu_op(*gpu_inputs)).T]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # The multinomial inputs live on ``multi``; ``node`` itself is the
        # GpuFromHost transfer and has a single input, so unpacking
        # ``node.inputs`` here would always raise.
        p_and_u = single_sample_inputs(multi.inputs)
        if p_and_u is None:
            return None
        p, u = p_and_u
        m, = multi.outputs
        if p.dtype == u.dtype == m.dtype == 'float32':
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in (p, u)]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
def local_gpu_multinomial(node):
    """Replace ``MultinomialFromUniform`` with ``GpuMultinomialFromUniform``.

    Two patterns are handled:

    * a ``MultinomialFromUniform`` node with float32 inputs/output where at
      least one input is produced by ``HostFromGpu``;
    * a ``GpuFromHost`` node whose input is produced by a float32
      ``MultinomialFromUniform`` node.

    Returns a one-element replacement list, or ``None`` when neither pattern
    applies.
    """
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        all_float32 = p.dtype == u.dtype == m.dtype == 'float32'
        if all_float32 and any(
                i.owner and isinstance(i.owner.op,
                                       theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            gpu_inputs = [gpu_from_host(i) for i in node.inputs]
            return [host_from_gpu(gpu_op(*gpu_inputs)).T]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
            and node.inputs[0].owner
            and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if p.dtype == u.dtype == m.dtype == 'float32':
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            gpu_inputs = [gpu_from_host(i) for i in multi.inputs]
            transposed = gpu_op(*gpu_inputs).T
            # The dimshuffle is on the cpu, but will be moved to the gpu by
            # an opt.
            return [gpu_from_host(transposed)]
def local_assigner(node):
    """Replace ``Assigner`` with ``GpuAssigner``.

    Two patterns are handled:

    * an ``Assigner`` node with float32 ``p``/output where at least one input
      is produced by ``HostFromGpu``: ``p`` and ``gr`` are transferred with
      ``gpu_from_host``, ``indx`` is forwarded unchanged, and the result is
      wrapped in ``host_from_gpu``;
    * a ``GpuFromHost`` node whose input is produced by a float32
      ``Assigner`` node: the transfer is replaced by the ``GpuAssigner``
      output.

    Returns a one-element replacement list, or ``None`` when neither pattern
    applies.
    """
    if type(node.op) is Assigner:
        p, indx, gr, = node.inputs
        vals, = node.outputs
        if (p.dtype == vals.dtype == 'float32' and
                any(i.owner and isinstance(i.owner.op,
                                           theano.sandbox.cuda.HostFromGpu)
                    for i in node.inputs)):
            gpu_op = GpuAssigner()
            ret = gpu_op(gpu_from_host(p), indx, gpu_from_host(gr))
            return [host_from_gpu(ret)]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is Assigner):
        multi = node.inputs[0].owner
        p, indx, gr = multi.inputs
        vals, = multi.outputs
        if p.dtype == vals.dtype == 'float32':
            gpu_op = GpuAssigner()
            # The GpuAssigner output is already a GPU variable (the branch
            # above feeds it to ``host_from_gpu``), so it replaces the
            # GpuFromHost output directly; ``gpu_from_host`` only accepts
            # host tensors.
            return [gpu_op(gpu_from_host(p), indx, gpu_from_host(gr))]
def local_gpu_multinomial(node):
    """Replace ``MultinomialFromUniform`` with ``GpuMultinomialFromUniform``.

    Only the single-sample case is rewritten: the multinomial node must have
    either two inputs ``(p, u)`` or a third ``n_samples`` input that
    ``get_scalar_constant_value`` resolves to ``1``.

    Two patterns are handled:

    * a ``MultinomialFromUniform`` node with float32 ``p``/``u``/output where
      at least one input is produced by ``HostFromGpu``;
    * a ``GpuFromHost`` node whose input is produced by a float32
      ``MultinomialFromUniform`` node.

    Returns a one-element replacement list, or ``None`` when the node is
    left alone.
    """
    def single_sample_inputs(inputs):
        """Return ``(p, u)`` if exactly one sample is requested, else None."""
        if len(inputs) == 2:
            p, u = inputs
            return p, u
        p, u, n_samples = inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        return p, u

    if type(node.op) is MultinomialFromUniform:
        p_and_u = single_sample_inputs(node.inputs)
        if p_and_u is None:
            return None
        p, u = p_and_u
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
                any(i.owner and isinstance(i.owner.op,
                                           theano.sandbox.cuda.HostFromGpu)
                    for i in node.inputs)):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            gpu_inputs = [gpu_from_host(i) for i in (p, u)]
            return [host_from_gpu(gpu_op(*gpu_inputs)).T]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # The multinomial inputs live on ``multi``; ``node`` itself is the
        # GpuFromHost transfer and has a single input, so unpacking
        # ``node.inputs`` here would always raise.
        p_and_u = single_sample_inputs(multi.inputs)
        if p_and_u is None:
            return None
        p, u = p_and_u
        m, = multi.outputs
        if p.dtype == u.dtype == m.dtype == 'float32':
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in (p, u)]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
def local_gpu_join_unsafe(node):
    """Move a ``JoinUnsafe`` of GPU-produced tensors onto the GPU.

    Inspired by the opt for convop.  In very loose notation, the subgraphs
    concerned first look like

        [array of HostTensor] -> HostToGpu -> GpuToHost
        -> Join
        -> HostToGpu -> GpuToHost

    This opt rewrites

        join(host_from_gpu) -> host_from_gpu(gpu_join)

    which leaves, as an intermediate result,

        host_from_gpu(gpu_join) -> HostToGpu -> GpuToHost

    The unnecessary GpuToHost -> HostToGpu pair should be removed by other
    opts, leaving ``host_from_gpu(gpu_join)``.

    For intermediate places in the graph not covered by this opt,
    ``gpu_from_host(join) -> gpu_join(gpu_from_host)`` could be useful; it is
    not implemented yet.

    Returns a one-element replacement list, or ``None`` when the node is not
    a ``JoinUnsafe`` or not all joined tensors qualify.
    """
    if not isinstance(node.op, JoinUnsafe):
        return None

    # optimizing this case:
    # join(host_from_gpu) -> host_from_gpu(gpu_join)
    axis_and_tensors = node.inputs
    tensors = axis_and_tensors[1:]

    def qualifies(t):
        """float32 and either produced by HostFromGpu or a constant."""
        from_gpu = t.owner is not None and isinstance(t.owner.op, HostFromGpu)
        return t.dtype == 'float32' and (
            from_gpu or isinstance(t, theano.gof.Constant))

    if not all(qualifies(t) for t in tensors):
        return None

    gpu_tensors = [as_cuda_ndarray_variable(t) for t in tensors]
    gpu_join_args = [axis_and_tensors[0]] + gpu_tensors
    return [host_from_gpu(GpuJoinUnsafe()(*gpu_join_args))]
def use_gpu_cumsum(node):
    """Rewrite a float32 ``CumsumOp`` node to use ``GpuCumsum``.

    The node is only rewritten when its input is float32 and is produced by
    a ``HostFromGpu`` op.  With an explicit axis, inputs with more than
    ``GpuCumsum.SUPPORTED_NDIMS`` dimensions are skipped.  Returns a
    one-element replacement list, or ``None`` when the node is left alone.
    """
    if type(node.op) is not CumsumOp:
        return None

    x = node.inputs[0]
    if x.dtype != 'float32':
        return None
    if not (x.owner and isinstance(x.owner.op, HostFromGpu)):
        return None

    axis = node.op.axis
    if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
        return None

    x = gpu_from_host(x)
    if axis is None:
        # ``gpu_cumsum`` assumes the array has been flattened if needed.
        if x.ndim > 1:
            x = GpuFlatten()(x)
        axis = 0

    return [host_from_gpu(GpuCumsum(axis)(x))]
def local_gpu_advanced_subtensor1_floats(node):
    """Rewrite ``AdvancedSubtensor1Floats`` to its GPU counterpart.

    Two patterns are handled:

    * ``GpuFromHost(AdvancedSubtensor1Floats(x, *coords))`` becomes
      ``GpuAdvancedSubtensor1Floats`` applied to
      ``as_cuda_ndarray_variable(x)``;
    * an ``AdvancedSubtensor1Floats`` node whose float32 ``x`` is produced
      by ``HostFromGpu`` becomes the GPU op applied to the GPU variable
      behind ``x``, wrapped in ``host_from_gpu``.

    The host op's ``_tag`` is forwarded to the GPU op.  Returns a
    one-element replacement list, or ``False`` otherwise.
    """
    if isinstance(node.op, GpuFromHost):
        producer = node.inputs[0].owner
        if producer and \
                producer.op.__class__ is AdvancedSubtensor1Floats:
            x = producer.inputs[0]
            coords = producer.inputs[1:]
            gpu_op = GpuAdvancedSubtensor1Floats(producer.op._tag)
            return [gpu_op(as_cuda_ndarray_variable(x), *coords)]

    if node.op.__class__ is AdvancedSubtensor1Floats:
        x = node.inputs[0]
        coords = node.inputs[1:]
        comes_from_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
        if comes_from_gpu and x.dtype == "float32":
            gpu_x, = x.owner.inputs
            gpu_op = GpuAdvancedSubtensor1Floats(node.op._tag)
            return [host_from_gpu(gpu_op(gpu_x, *coords))]

    return False
input1_nervana = to_gputensor(inputs[0][0]) input2_nervana = to_gputensor(inputs[1][0]) output_nervana = to_gputensor(z[0]) lib.dot(input1_nervana, input2_nervana, output_nervana, alpha=1, beta=0, relu=self.relu) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk nervana_dot = NervanaDot() if __name__ == "__main__": import theano.tensor as T x = theano.shared(np.random.randn(2000, 3000).astype(theano.config.floatX)) y = theano.shared(np.random.randn(3000, 1000).astype(theano.config.floatX)) prod1 = T.dot(x, y) prod2 = host_from_gpu(nervana_dot(x, y)) val1 = prod1.eval() val2 = prod2.eval() assert np.allclose(val1, val2)
def benchmark(n_imgs, n_channels, img_shape, n_filters, filter_shape, pad):
    """Compare the cuda-convnet ops with the FFT convolution ops.

    For forward propagation, the gradient with respect to the images and the
    gradient with respect to the filters, both implementations are compiled
    on random data, the agreement of their results (per ``allclose``) is
    printed, and the average running times reported by ``avg_running_time``
    are printed together with their ratio.

    Parameters
    ----------
    n_imgs, n_channels, n_filters : int
        Sizes of the batch, channel and filter dimensions.
    img_shape, filter_shape : tuple of two ints
        ``(height, width)`` of the images and of the filters.
    pad : int
        Padding handed to every op; also used to derive the output size.
    """
    def shared_floatX(array):
        """Wrap ``array`` in a Theano shared variable cast to ``floatX``."""
        return theano.shared(array.astype(theano.config.floatX))

    def report_timings(duration_cc, duration_fft):
        """Print both average durations and the resulting speed-up."""
        print(' avg. duration: cuda_convnet: %.4f fft: %.4f'
              % (duration_cc, duration_fft))
        print(' speedup: %.2f' % (duration_cc/duration_fft))

    print('\nn_imgs: %i, n_channels: %i, img_shape: (%i, %i), '
          % ((n_imgs, n_channels) + img_shape)
          + 'n_filters: %i, filter_shape: (%i, %i), pad: %i'
          % ((n_filters,) + filter_shape + (pad,)))

    # Setup arrays
    img_h, img_w = img_shape
    filter_h, filter_w = filter_shape
    convout_h = img_h + 2*pad - filter_h + 1
    convout_w = img_w + 2*pad - filter_w + 1

    # Each array is generated once and then transposed so that both
    # implementations work on the same values in their own axis order.
    imgs_bc01 = np.random.randn(n_imgs, n_channels, img_h, img_w)
    imgs_c01b = np.transpose(imgs_bc01, (1, 2, 3, 0))
    filters_fc01 = np.random.randn(n_filters, n_channels, filter_h, filter_w)
    filters_c01f = np.transpose(filters_fc01, (1, 2, 3, 0))
    convout_bc01 = np.random.randn(n_imgs, n_filters, convout_h, convout_w)
    convout_c01b = np.transpose(convout_bc01, (1, 2, 3, 0))

    imgs_bc01_t = shared_floatX(imgs_bc01)
    imgs_c01b_t = shared_floatX(imgs_c01b)
    filters_fc01_t = shared_floatX(filters_fc01)
    filters_c01f_t = shared_floatX(filters_c01f)
    convout_bc01_t = shared_floatX(convout_bc01)
    convout_c01b_t = shared_floatX(convout_c01b)

    # Forward propagation
    print('fprop')
    convout_cc_op = FilterActs(stride=1, partial_sum=4, pad=pad)
    convout_cc_expr = convout_cc_op(imgs_c01b_t, filters_c01f_t)
    convout_cc_fun = theano.function([], convout_cc_expr)
    convout_cc = convout_cc_fun()
    convout_cc = np.transpose(convout_cc, (3, 0, 1, 2))

    convout_fft_op = ConvBC01(n_imgs, n_channels, n_filters, img_shape,
                              filter_shape, (pad, pad))
    convout_fft_expr = convout_fft_op(imgs_bc01_t, filters_fc01_t)
    # The correctness check compiles the expression wrapped in
    # host_from_gpu; the timed function below is compiled without it.
    convout_fft_fun = theano.function([], host_from_gpu(convout_fft_expr))
    convout_fft = convout_fft_fun()
    print(' correct: ' + str(allclose(convout_fft, convout_cc)))
    duration_cc = avg_running_time(convout_cc_fun)
    convout_fft_fun = theano.function([], convout_fft_expr)
    duration_fft = avg_running_time(convout_fft_fun)
    report_timings(duration_cc, duration_fft)
    del convout_fft_op
    del convout_fft_expr
    del convout_fft_fun
    del convout_cc_op
    del convout_cc_expr
    del convout_cc_fun

    # Back propagation, imgs
    print('bprop_imgs')
    dimgs_cc_op = ImageActs(stride=1, partial_sum=1, pad=pad)
    dimgs_cc_expr = dimgs_cc_op(convout_c01b_t, filters_c01f_t)
    dimgs_cc_fun = theano.function([], dimgs_cc_expr)
    dimgs_cc = dimgs_cc_fun()
    dimgs_cc = np.transpose(dimgs_cc, (3, 0, 1, 2))

    dimgs_fft_op = ConvBC01ImgsGrad(n_imgs, n_channels, n_filters, img_shape,
                                    filter_shape, (pad, pad))
    dimgs_fft_expr = dimgs_fft_op(filters_fc01_t, convout_bc01_t)
    dimgs_fft_fun = theano.function([], host_from_gpu(dimgs_fft_expr))
    dimgs_fft = dimgs_fft_fun()
    print(' correct: ' + str(allclose(dimgs_fft, dimgs_cc)))
    duration_cc = avg_running_time(dimgs_cc_fun)
    dimgs_fft_fun = theano.function([], dimgs_fft_expr)
    duration_fft = avg_running_time(dimgs_fft_fun)
    report_timings(duration_cc, duration_fft)
    del dimgs_fft_op
    del dimgs_fft_expr
    del dimgs_fft_fun
    del dimgs_cc_op
    del dimgs_cc_expr
    del dimgs_cc_fun

    # Back propagation, filters
    print('bprop_filters')
    dfilters_cc_op = WeightActs(stride=1, partial_sum=1, pad=pad)
    dfilters_cc_expr = dfilters_cc_op(imgs_c01b_t, convout_c01b_t,
                                      T.as_tensor_variable(filter_shape))
    dfilters_cc_fun = theano.function([], dfilters_cc_expr)
    # Only the first element of the compiled function's result is compared.
    dfilters_cc = dfilters_cc_fun()[0]
    dfilters_cc = np.transpose(dfilters_cc, (3, 0, 1, 2))

    dfilters_fft_op = ConvBC01FiltersGrad(n_imgs, n_channels, n_filters,
                                          img_shape, filter_shape,
                                          (pad, pad))
    dfilters_fft_expr = dfilters_fft_op(imgs_bc01_t, convout_bc01_t)
    dfilters_fft_fun = theano.function([], host_from_gpu(dfilters_fft_expr))
    dfilters_fft = dfilters_fft_fun()
    print(' correct: ' + str(allclose(dfilters_fft, dfilters_cc)))
    duration_cc = avg_running_time(dfilters_cc_fun)
    dfilters_fft_fun = theano.function([], dfilters_fft_expr)
    duration_fft = avg_running_time(dfilters_fft_fun)
    report_timings(duration_cc, duration_fft)
lib.dot(input1_nervana, input2_nervana, output_nervana, alpha=1, beta=0, relu=self.relu) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk nervana_dot = NervanaDot() if __name__ == "__main__": import theano.tensor as T x = theano.shared(np.random.randn(2000, 3000).astype(theano.config.floatX)) y = theano.shared(np.random.randn(3000, 1000).astype(theano.config.floatX)) prod1 = T.dot(x, y) prod2 = host_from_gpu(nervana_dot(x, y)) val1 = prod1.eval() val2 = prod2.eval() assert np.allclose(val1, val2)
#m = N #n = N #k = N #m = 784 #n = 512 #k = 10 m = 10000 n = 4096 k = 10 print(m, n, k) A = T.fmatrix() B = T.fmatrix() dot1 = theano.function([A, B], T.dot(A, B)) dot2 = theano.function([A, B], host_from_gpu(gemm(A, B))) dot3 = theano.function([A, B], host_from_gpu(magma_gemm(A, B))) dot4 = theano.function([A, B], host_from_gpu(xnor_gemm(A, B))) dot5 = theano.function([A, B], host_from_gpu(magma_mod_gemm(A, B))) # Generating random BINARY matrices a = SignNumpy(np.random.randn(m, n)) b = SignNumpy(np.random.randn(n, k)) # a = np.float32(np.random.randn(m, n)) # b = np.float32(np.random.randn(n, k)) start_time = time.time() c1 = dot1(a, b) dot1_duration = time.time() - start_time # print c1[0][0] print("Theano time = " + str(dot1_duration) + "s")
# Test suite if __name__ == "__main__": # N = 8192 N = 4096 m = N n = N k = N # m = 784 # n = 512 # k = 10 A = T.fmatrix() B = T.fmatrix() dot1 = theano.function([A, B], T.dot(A, B)) dot2 = theano.function([A, B], host_from_gpu(gemm(A, B))) dot3 = theano.function([A, B], host_from_gpu(xnor_gemm(A, B))) # Generating random BINARY matrices a = SignNumpy(np.random.randn(m, n)) b = SignNumpy(np.random.randn(n, k)) # a = np.float32(np.random.randn(m, n)) # b = np.float32(np.random.randn(n, k)) start_time = time.time() c1 = dot1(a, b) dot1_duration = time.time() - start_time # print c1[0][0] print("Theano time = " + str(dot1_duration) + "s") start_time = time.time()
# return vector_times_vector_grad(x,y,gz)
# Module-level instance of the op, used like a function below.
vector_times_vector=VectorTimesVector()
import numpy
from theano import tensor
import scipy
from scipy import io
# Demo: apply the op to two random float32 vectors of length 1000 and print
# the inputs and the result brought back with host_from_gpu.
a=tensor.vector('a',dtype='float32')
b=tensor.vector('b',dtype='float32')
c=vector_times_vector(a,b)
f=theano.function([a,b],host_from_gpu(c))
#ga,gb=theano.grad(c.sum(),[a,b])
#g=theano.function([a,b],[ga,gb])
x=numpy.random.randn(1000).astype('float32')
y=numpy.random.randn(1000).astype('float32')
z=f(x,y)
# Single-argument print(...) calls produce the same output under Python 2
# and are also valid Python 3, unlike the print statement.
print('x')
print(x)
print('y')
print(y)
print('z')
print(z)
# Test suite if __name__ == "__main__": # N = 8192 N = 4096 m = N n = N k = N # m = 784 # n = 512 # k = 10 A = T.fmatrix() B = T.fmatrix() dot1 = theano.function([A,B], T.dot(A, B)) dot2 = theano.function([A,B], host_from_gpu(gemm(A, B))) dot3 = theano.function([A,B], host_from_gpu(xnor_gemm(A,B))) # Generating random BINARY matrices a = SignNumpy(np.random.randn(m, n)) b = SignNumpy(np.random.randn(n, k)) # a = np.float32(np.random.randn(m, n)) # b = np.float32(np.random.randn(n, k)) start_time = time.time() c1 = dot1(a,b) dot1_duration = time.time() - start_time # print c1[0][0] print("Theano time = "+str(dot1_duration)+"s") start_time = time.time()