def fit(self,X,Budget=None,W=None): self.X = cuda.to_device(X.astype(np.float64,order='F')) self.Budget = cuda.device_array((self.budgetSize,self.X.shape[1]),dtype=np.float64,order='F') self.kx = cuda.device_array((self.budgetSize,self.X.shape[0]),dtype=np.float64,order='F') self.Wkx = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F') self.H = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F') if Budget is None: permutation = np.random.permutation(self.X.shape[0]) self.permutation = cuda.to_device(permutation) initBudget(self.X,self.permutation,self.Budget) else: self.Budget = cuda.to_device(Budget.astype(np.float64,order='F')) self.calculateKB() self.calculateKX() if W is None: self.initW() else: self.W = cuda.to_device(W.astype(np.float64,order='F')) self.t = 0 for i in xrange(self.epochs): print "Epoch " + str(i) samples,features = self.X.shape permutation = getPermutation(samples,self.miniBatchSize) self.permutation = cuda.to_device(permutation) for j in xrange((samples + self.miniBatchSize) / self.miniBatchSize): loadBatch(self.kx,self.permutation,j,self.kxi) self.nextW() self.t += 1 self.predictH()
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Launch ``cu_monte_carlo_pricer`` over ``paths`` on a single CUDA stream.

    One thread is provisioned per row of ``paths`` (512 threads per block).
    A cuRAND PRNG fills a buffer of normal samples and a QRNG fills a buffer
    of uint32 seeds; both are passed to the kernel together with the drift
    and diffusion coefficients derived from ``interest``, ``volatility`` and
    ``dt``. The device copy of ``paths`` is transferred back with
    ``to_host`` before the stream is synchronized.
    """
    num_paths = paths.shape[0]
    threads = 512
    blocks = int(math.ceil(float(num_paths) / threads))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    # Per-path random inputs, generated directly on the device.
    d_normdist = cuda.device_array(num_paths, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(num_paths, dtype=np.uint32, stream=stream)
    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    drift = interest - 0.5 * volatility**2
    diffusion = volatility * math.sqrt(dt)

    kernel = cu_monte_carlo_pricer[(blocks, 1), (threads, 1, 1), stream]
    kernel(d_paths, dt, drift, diffusion, d_normdist, d_seed)

    d_paths.to_host(stream)
    stream.synchronize()
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Run the ``cu_monte_carlo_pricer`` kernel for every row of ``paths``.

    All work (random-number generation, upload, kernel launch and the
    ``to_host`` transfer) is queued on one CUDA stream, which is
    synchronized before returning.
    """
    path_count = paths.shape[0]
    block_threads = 512
    grid_blocks = int(math.ceil(float(path_count) / block_threads))
    griddim = grid_blocks, 1
    blockdim = block_threads, 1, 1

    stream = cuda.stream()
    normal_gen = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    seed_gen = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    # Device buffers for the normal samples and the per-path seeds.
    d_normdist = cuda.device_array(path_count, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(path_count, dtype=np.uint32, stream=stream)
    normal_gen.normal(d_normdist, 0, 1)
    seed_gen.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    # Coefficients handed to the kernel.
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    cu_monte_carlo_pricer[griddim, blockdim, stream](
        d_paths, dt, c0, c1, d_normdist, d_seed)

    d_paths.to_host(stream)
    stream.synchronize()
def __init__(self, budgetSize, latentTopics, miniBatchSize, epochs, Gamma, Lambda, Alpha, metric='rbf', sigma=1.0):
    """
    OKMF class: OKMF is a method to perform a matrix factorization
    in a feature space.

    Stores the hyper-parameters and pre-allocates every device buffer whose
    shape depends only on them; buffers that depend on the training data
    are left as None here.

    Parameters
    ----------
    budgetSize : int
        Budget size.
    latentTopics : int
        Latent topics.
    miniBatchSize : int
        Size of minibatch.
    epochs : int
        Number of epochs.
    Gamma : float
        Gamma parameter
    Lambda : float
        Lambda parameter
    Alpha : float
        Alpha parameter
    metric : string
        Type of kernel. Default rbf
    sigma : float
        RBF kernel sigma parameter. Default 1.0.
    """
    def _buffer(rows, cols):
        # All work buffers are float64, column-major, uninitialised.
        return cuda.device_array((rows, cols), dtype=np.float64, order='F')

    # Hyper-parameters.
    self.budgetSize = budgetSize
    self.latentTopics = latentTopics
    self.miniBatchSize = miniBatchSize
    self.epochs = epochs
    self.Gamma = Gamma
    self.Lambda = Lambda
    self.Alpha = Alpha
    self.metric = metric
    self.sigma = sigma

    self.W = None
    self.h = _buffer(latentTopics, miniBatchSize)
    self.KB = _buffer(budgetSize, budgetSize)
    self.kxi = _buffer(budgetSize, miniBatchSize)
    self.Blas = cublas.Blas()

    # Data-dependent state, filled in later.
    self.X = None
    self.Budget = None
    self.permutation = None
    self.kx = None
    self.Wkx = None
    self.H = None

    # Scratch buffers for intermediate products.
    self.KBW = _buffer(budgetSize, latentTopics)
    self.KBWh = _buffer(budgetSize, miniBatchSize)
    self.KBWhh = _buffer(budgetSize, latentTopics)
    self.grad = _buffer(budgetSize, latentTopics)
    self.kxih = _buffer(budgetSize, latentTopics)
    self.WKBW = _buffer(latentTopics, latentTopics)
    self.Wkxi = _buffer(latentTopics, miniBatchSize)

    # Alpha-scaled identity, uploaded once.
    scaled_identity = Alpha * np.eye(latentTopics)
    self.eyeAlpha = cuda.to_device(scaled_identity.astype(np.float64, order='F'))
def T(self, a, out=None):
    """Returns the transpose of a 2D array.

    Parameters
    ----------
    a : array-like
        Numpy or DeviceNDArray to transpose.
    out : DeviceNDArray (optional)
        Array to overwrite with result. Must have shape
        ``(a.shape[1], a.shape[0])``.

    Returns
    -------
    DeviceNDArray
        ``out`` if given, otherwise a newly allocated Fortran-ordered array.

    Raises
    ------
    NotImplementedError
        If ``out`` is neither None nor a DeviceNDArray, if ``a`` is not
        2-D, or if ``out`` has the wrong shape.
    """
    a, out_dtype = _check_array(a)

    # Identity test against None: `out == None` on an array-like triggers an
    # element-wise comparison whose truth value is ambiguous.
    if out is not None and not isinstance(out, cuda.cudadrv.devicearray.DeviceNDArray):
        raise NotImplementedError

    if a.ndim != 2:
        raise NotImplementedError

    a_dim = a.shape
    if out is None:
        out = cuda.device_array((a_dim[1], a_dim[0]), dtype=out_dtype, order='F')
    elif not (out.shape[0] == a_dim[1] and out.shape[1] == a_dim[0]):
        raise NotImplementedError

    # out = 1.0 * a^T + 0.0 * a^T
    self.blas.geam('T', 'T', a_dim[1], a_dim[0], 1., a, 0., a, out)
    return out
def tanh(self, a, out=None):
    """Tanh of input.

    Parameters
    ----------
    a : array-like
        1-D or 2-D array to apply tanh to.
    out : DeviceNDArray, optional
        Array to overwrite with the result; must have the same shape as
        ``a``. Allocated (Fortran order) when omitted.

    Raises
    ------
    NotImplementedError
        If ``out`` is neither None nor a DeviceNDArray, or ``a`` has more
        than two dimensions.
    ValueError
        If ``out`` does not have the shape of ``a``.
    """
    a, out_dtype = _check_array(a)
    a_dim = a.shape

    if out is not None and not isinstance(out, cuda.cudadrv.devicearray.DeviceNDArray):
        raise NotImplementedError

    if out is None:
        out = cuda.device_array(shape=a_dim, dtype=out_dtype, order='F')
    elif out.shape != a_dim:
        raise ValueError('matrices are not aligned')

    # Divide in floating point before ceil(): integer division would floor
    # first and leave the trailing partial block without threads.
    if a.ndim == 2:
        griddim2 = (int(ceil(a_dim[0] / float(self.blockdim2[0]))),
                    int(ceil(a_dim[1] / float(self.blockdim2[1]))))
        tanh_m[griddim2, self.blockdim2, self.stream](a, out)
    elif a.ndim == 1:
        griddim = int(ceil(a_dim[0] / float(self.blockdim)))
        tanh_v[griddim, self.blockdim, self.stream](a, out)
    else:
        raise NotImplementedError
    return out
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance every column of ``paths`` on the GPU using pooled buffers.

    Column 0 of ``paths`` is the starting state. For each later column j,
    fresh normal samples are drawn, ``step`` writes the new values into a
    device buffer obtained from the ``MM`` pool, and that buffer is copied
    back into ``paths[:, j]``. The buffer for the previous column is handed
    back to the pool after each step.
    """
    num_paths = paths.shape[0]
    num_steps = paths.shape[1]

    pool = MM(shape=num_paths, dtype=np.double, prealloc=5)

    # Launch geometry is computed but not passed to ``step`` in this variant.
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(num_paths) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(num_paths, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_previous = cuda.to_device(paths[:, 0], to=pool.get())
    for j in range(1, num_steps):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_current = cuda.to_device(paths[:, j], stream=stream, to=pool.get())
        step(d_previous, dt, c0, c1, d_normdist, out=d_current, stream=stream)
        d_current.copy_to_host(paths[:, j], stream=stream)
        pool.free(d_previous)
        d_previous = d_current

    stream.synchronize()
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance every column of ``paths`` with a pre-configured ``step`` kernel.

    Column 0 of ``paths`` is the starting state. Each later column is
    produced on the device from the previous one and copied back into
    ``paths``. Device buffers are recycled through an ``MM`` pool and all
    asynchronous work is queued on one stream, synchronized at the end.
    """
    n = paths.shape[0]
    pool = MM(shape=n, dtype=np.double, prealloc=5)

    threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    blocks = int(math.ceil(float(n) / threads_per_block))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    launch_step = step[blocks, threads_per_block, stream]

    d_prev = cuda.to_device(paths[:, 0], to=pool.get())
    for col in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_next = cuda.to_device(paths[:, col], stream=stream, to=pool.get())
        launch_step(d_prev, d_next, dt, c0, c1, d_normdist)
        d_next.copy_to_host(paths[:, col], stream=stream)
        # Return the consumed buffer to the pool, ordered on this stream.
        pool.free(d_prev, stream=stream)
        d_prev = d_next

    stream.synchronize()
def __init__(self, shape, dtype, prealloc):
    """Pre-allocate ``prealloc`` device arrays of the given shape and dtype.

    Every array is appended to ``self.freelist`` and paired with its own
    non-timing CUDA event in ``self.events`` (keyed by the array).
    """
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for _ in range(prealloc):
        buf = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(buf)
        # timing=False: the event is created without timing support.
        self.events[buf] = cuda.event(timing=False)
def preScan(out_d, in_d, in_size):
    """Scan ``in_d`` into ``out_d`` on the GPU, recursing over block totals.

    Each block handles ``2 * BLOCK_SIZE`` elements. ``exclusiveScanGPU``
    writes the per-block result into ``out_d`` and one auxiliary value per
    block into ``aux_d``. When more than one block is needed, the auxiliary
    array is itself scanned by a recursive call, and
    ``exclusiveCombineGPU`` then folds the auxiliary values into ``out_d``.

    Parameters
    ----------
    out_d : device array
        Destination buffer.
    in_d : device array
        Source buffer.
    in_size : int
        Number of elements to process.
    """
    threads_per_block = (BLOCK_SIZE, 1)
    nBlocks = int(ceil(in_size / (2 * 1.0 * BLOCK_SIZE)))
    number_of_blocks = (nBlocks, 1)

    aux_d = cuda.device_array(nBlocks, dtype=np.uint32)
    exclusiveScanGPU[number_of_blocks, threads_per_block](aux_d, out_d, in_d, in_size)

    if nBlocks > 1:
        # A second auxiliary buffer is only needed when the block values
        # themselves must be scanned; allocate it lazily.
        aux_od = cuda.device_array(nBlocks, dtype=np.uint32)
        preScan(aux_od, aux_d, nBlocks)
    else:
        aux_od = aux_d

    exclusiveCombineGPU[number_of_blocks, threads_per_block](out_d, aux_od, in_size)
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance every column of ``paths`` on the GPU using two CUDA streams.

    The rows of ``paths`` are split into ``num_streams`` contiguous
    partitions. Each partition gets its own stream, cuRAND generator,
    noise buffer and configured ``cu_step`` kernel. Column 0 is the
    starting state; each later column is computed from the previous one and
    copied back into ``paths``. Device buffers come from an ``MM`` pool and
    are returned to it once the step that reads them has been queued.
    """
    n = paths.shape[0]
    num_streams = 2

    # Split the rows into contiguous [begin, end) partitions of equal width;
    # the last partition is clipped to n.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]
    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, d_last, d_paths, d_norm in zip(steplist, d_lastlist,
                                                 d_pathslist, d_normlist):
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Free each partition's own previous buffer on its own stream.
        # Reusing a `d_last` left over from the loop above would free one
        # buffer once per stream and never release the others.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)

        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
def mean(self, a, out=None, axis=None):
    """Average array elements.

    Parameters
    ----------
    a : array-like
        1-D or 2-D array to average.
    out : array-like
        Result will be stored in this array (only used for 2-D input with
        ``axis`` 0 or 1; allocated when omitted).
    axis : int
        1 or 0 for 2D arrays. None averages over all elements.

    Returns
    -------
    A scalar when all elements are averaged, otherwise ``out``.

    Raises
    ------
    ValueError
        If ``out`` has the wrong length for the requested axis.
    NotImplementedError
        If ``a`` has more than two dimensions.

    Notes
    -----
    The whole-array paths use BLAS ``asum``, which sums absolute values,
    so they match the arithmetic mean only for non-negative input.
    NOTE(review): for 2-D input with an ``axis`` other than None, 0 or 1
    the incoming ``out`` is returned untouched -- confirm that is intended.
    """
    a, out_dtype = _check_array(a)
    a_dim = a.shape

    if a.ndim == 2:
        if axis is None:
            count = np.prod(a_dim)
            d_flat_a = _cu_reshape(a, (count,), (a.strides[0],), out_dtype)
            out = self.blas.asum(d_flat_a) / float(count)
        elif axis == 0:
            if out is None:
                out = cuda.device_array(a_dim[1], dtype=out_dtype, order='F')
            elif out.shape[0] != a_dim[1]:
                raise ValueError('matrices are not aligned')
            # Float division before ceil(): integer division would floor
            # first and drop the trailing partial block.
            griddim = int(ceil(a_dim[1] / float(self.blockdim)))
            mean_0[griddim, self.blockdim, self.stream](a, float(a_dim[0]), out)
        elif axis == 1:
            if out is None:
                out = cuda.device_array(a_dim[0], dtype=out_dtype, order='F')
            elif out.shape[0] != a_dim[0]:
                raise ValueError('matrices are not aligned')
            griddim = int(ceil(a_dim[0] / float(self.blockdim)))
            mean_1[griddim, self.blockdim, self.stream](a, float(a_dim[1]), out)
    elif a.ndim == 1:
        out = self.blas.asum(a) / float(np.prod(a_dim))
    else:
        raise NotImplementedError
    return out
def mtranspose(a):
    """Launch ``d_mtranspose`` to write the transpose of ``a`` into a new array.

    Parameters
    ----------
    a : 2-D device array
        Input matrix.

    Returns
    -------
    A newly allocated float32 device array of shape
    ``(a.shape[1], a.shape[0])``.
    """
    rows, cols = a.shape[0], a.shape[1]
    # At most 32x32 threads per block, shrunk for small matrices.
    blockDim = (min(32, rows), min(32, cols))
    # Ceiling division with explicit floor division so the grid dimensions
    # are integers regardless of the active division semantics.
    gridDim = ((rows + blockDim[0] - 1) // blockDim[0],
               (cols + blockDim[1] - 1) // blockDim[1])
    b = cuda.device_array((cols, rows), dtype='float32')
    d_mtranspose[gridDim, blockDim](a, b)
    return b
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically
    slower compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Each step computes ``delta = int_m^T phi + rho_inv[step] * dec_m^T phi``
    with two ``gemv`` calls and then updates ``phi += dX[step] * delta``
    with ``axpy``.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not used by this kernel
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """
    precision_bits = config['CUDA_precision']
    if precision_bits == 32:
        calc_precision = np.float32
    elif precision_bits == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda, float32  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not "
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()

    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # NOTE(review): the scalars are cast with float32 even when
    # CUDA_precision is 64 -- confirm this is intended.
    one = float32(1.0)
    zero = float32(0.0)

    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # delta = int_m^T * phi
        cubl.gemv(trans='T', m=m, n=n, alpha=one, A=cu_int_m,
                  x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta += rho_inv[step] * dec_m^T * phi
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi += dX[step] * delta
        cubl.axpy(alpha=float32(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
def montecarlo_datamgmt(paths, dt, interest, volatility):
    """Fill columns 1.. of ``paths`` on the GPU, keeping prices device-resident.

    Column 0 of ``paths`` is uploaded once. For each later column, normal
    noise is generated on the device, ``step`` computes the next prices
    from the previous ones, and the result is copied into ``paths[:, j]``.
    """
    drift = interest - 0.5 * volatility ** 2
    vol_step = volatility * np.sqrt(dt)

    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    num_paths = paths.shape[0]
    d_noises = cuda.device_array(num_paths)

    # Copy first set of stock prices to the GPU
    d_curLast = cuda.to_device(paths[:, 0])
    # Create an empty array to hold the next set of calculated prices
    d_curNext = cuda.device_array(num_paths)

    for j in xrange(1, paths.shape[1]):  # for each time step
        # Generate gaussian noises for simulation
        prng.normal(d_noises, 0., 1.)

        # Call the GPU-accelerated step function to calculate the next
        # set of prices
        d_curNext = step(d_curLast, dt, drift, vol_step, d_noises)

        # Copy calculated prices to host
        d_curNext.copy_to_host(paths[:, j])

        # Swap so that the prices just copied to the host become "last".
        d_curNext, d_curLast = d_curLast, d_curNext
def most_similar(a):
    """Return the vocabulary entry selected by ``margmin`` over ``d_distances``.

    ``d_distances`` is launched with one thread per row of the module-level
    ``d_vectors`` and writes into a ``(1, n_rows)`` float32 buffer. The
    index returned by ``margmin`` on that buffer is looked up in
    ``inv_vocab``.

    Parameters
    ----------
    a : 2-D array
        Query; ``a.shape[1]`` must equal ``d_vectors.shape[1]``.

    Raises
    ------
    AssertionError
        If the second dimensions of ``a`` and ``d_vectors`` differ.
    """
    assert a.shape[1] == d_vectors.shape[1], "Size Mismatch: (%i,%i), (%i,%i)" %(a.shape[0],a.shape[1],d_vectors.shape[0],d_vectors.shape[1])
    n_vectors = d_vectors.shape[0]
    blockDim = 1024
    # Ceiling division with explicit floor division so the grid size is an
    # integer regardless of the active division semantics.
    gridDim = (n_vectors + blockDim - 1) // blockDim
    val = cuda.device_array((1, n_vectors), dtype='float32')
    d_distances[gridDim, blockDim](a, d_vectors, val)
    _, idx = margmin(val)
    return inv_vocab[idx]
def test_sort(): in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) #4, 7, 2, 6, 3, 5, 1, 0 out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) for i in range(0, NUM_ELEMENTS): in_h[i] = NUM_ELEMENTS - i - 1 in_d = cuda.to_device(in_h) out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32) temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32) tkg1 = time() for bit_shift in range(0, 32): tk1 = time() #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift) preScan(out_d, in_d, NUM_ELEMENTS) tk2 = time() #print bit_shift, tk2 - tk1 in_d = out_d out_d = temp_d temp_d = in_d tkg2 = time() out_d.copy_to_host(out_h) cuda.synchronize() # line = "" # for i in range(0, NUM_ELEMENTS): # line += " " + str(out_h[i]) # # print line in_cpu = [NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)] tc1 = time() in_cpu.sort() tc2 = time() print "GPU Time = ", tkg2 - tkg1 print "CPU Time = ", tc2 - tc1
def test_sort(): in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) #4, 7, 2, 6, 3, 5, 1, 0 out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32) for i in range(0, NUM_ELEMENTS): in_h[i] = NUM_ELEMENTS - i - 1 in_d = cuda.to_device(in_h) out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32) temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32) tkg1 = time() for bit_shift in range(0, 32): tk1 = time() #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift) preScan(out_d, in_d, NUM_ELEMENTS) tk2 = time() #print bit_shift, tk2 - tk1 in_d = out_d out_d = temp_d temp_d = in_d tkg2 = time() out_d.copy_to_host(out_h) cuda.synchronize() # line = "" # for i in range(0, NUM_ELEMENTS): # line += " " + str(out_h[i]) # # print line in_cpu = [NUM_ELEMENTS - i -1 for i in range(0, NUM_ELEMENTS)] tc1 = time() in_cpu.sort() tc2 = time() print "GPU Time = ", tkg2 - tkg1 print "CPU Time = ", tc2 - tc1
def flush(self, metric_opt, supp_opt):
    """Process all queued ``self.Vcs`` and return the best (metric, support).

    The first column of every queued matrix is packed into one buffer and
    sorted per segment on the GPU. For each segment the last ``k`` sorted
    indices are taken as a candidate support; its metric is the squared
    norm of the corresponding rows of ``self.V``. The candidate replaces
    the incoming optimum only when its metric is strictly larger. The
    queue is cleared afterwards.

    Returns the inputs unchanged when nothing is queued.
    """
    if not self.Vcs:
        # Nothing to do
        return metric_opt, supp_opt

    k = self.k
    V = self.V
    nodect = V.shape[0]
    numseg = len(self.Vcs)
    assert nodect
    assert numseg

    # Fill buffer for segmented sort: one contiguous segment per Vc.
    D = np.zeros(nodect * numseg, dtype=np.float32)
    for seg, Vc in enumerate(self.Vcs):
        start = seg * nodect
        D[start:start + nodect] = Vc[:, 0]

    # Prepare for GPU segmented sort
    dD = cuda.to_device(D)
    dI = cuda.device_array((numseg, nodect), dtype=np.uint32)

    blksz = 32
    grid = (divup(dI.shape[0], blksz), divup(dI.shape[1], blksz))
    init_indices[grid, (blksz, blksz)](dI)

    # Segment boundaries: a single 0 for one segment, otherwise the start
    # offsets of segments 1..numseg-1.
    if numseg == 1:
        segments = np.arange(1, dtype=np.int32)
    else:
        segments = np.arange(1, numseg, dtype=np.int32) * nodect

    segmented_sort(dD, dI, cuda.to_device(segments))

    topk_list = [dI[seg, -k:].copy_to_host() for seg in range(numseg)]

    # Reduce
    for topk in topk_list:
        # Assume A is huge
        metric = np.linalg.norm(V[topk, :]) ** 2
        if metric > metric_opt:
            metric_opt, supp_opt = metric, topk

    # Clear all Vc
    self.Vcs.clear()
    return metric_opt, supp_opt
def diag(self, a, out=None):
    """Creates vector from diagonal of matrix or matrix with diagonal from vector.

    Parameters
    ----------
    a : array-like
        Vector or array from which to take diagonal.
    out : array-like, optional
        Output array. For 2-D ``a`` it must be 1-D of length
        ``a.shape[0]``; for 1-D ``a`` it must have shape
        ``(len(a), len(a))``. Allocated (Fortran order) when omitted.

    Raises
    ------
    ValueError
        If ``out`` has the wrong shape.
    NotImplementedError
        If ``a`` has more than two dimensions.
    """
    a, out_dtype = _check_array(a)
    a_dim = a.shape
    size = a_dim[0]

    # Grid sizes divide in floating point before ceil(): integer division
    # would floor first and leave the trailing partial block uncovered.
    if a.ndim == 2:
        if out is None:
            out = cuda.device_array(shape=size, dtype=out_dtype, order='F')
        elif not (out.shape[0] == size and out.ndim == 1):
            raise ValueError('matrices are not aligned')
        griddim = int(ceil(size / float(self.blockdim)))
        diag2v[griddim, self.blockdim, self.stream](a, out)
    elif a.ndim == 1:
        if out is None:
            out = cuda.device_array(shape=(size, size), dtype=out_dtype, order='F')
        elif out.shape != (size, size):
            raise ValueError('matrices are not aligned')
        griddim2 = (int(ceil(size / float(self.blockdim2[0]))),
                    int(ceil(size / float(self.blockdim2[1]))))
        diag2m[griddim2, self.blockdim2, self.stream](a, out)
    else:
        raise NotImplementedError
    return out
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill columns 1.. of ``paths`` on the GPU, one time step at a time.

    Column 0 is the starting state. For each later column, cuRAND draws
    normal samples, ``step`` writes the new values into a device copy of
    that column, and the result is copied back into ``paths``.
    """
    num_paths = paths.shape[0]

    # Launch geometry is computed but not passed to ``step`` in this variant.
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(num_paths) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(num_paths, dtype=np.double)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_previous = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_current = cuda.to_device(paths[:, j])
        step(d_previous, dt, c0, c1, d_normdist, out=d_current)
        d_current.copy_to_host(paths[:, j])
        d_previous = d_current
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance ``paths`` column by column with the ``step`` function.

    Each iteration draws new normal noise on the device, computes column
    ``j`` from column ``j - 1`` and copies it back into ``paths[:, j]``.
    No explicit stream is used.
    """
    n = paths.shape[0]
    n_columns = paths.shape[1]

    threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    # Computed for reference; ``step`` is called without a launch config.
    blocks = int(math.ceil(float(n) / threads_per_block))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    column = 1
    while column < n_columns:
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, column])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, column])
        d_last = d_paths
        column += 1
def mc_cuda(paths, dt, interest, volatility):
    """Fill columns 1.. of ``paths`` with the ``step_cuda`` kernel.

    Column 0 is the starting state. Each later column is computed on the
    device from the previous one and copied back into ``paths``. Transfers
    and kernel launches are queued on one CUDA stream, which is
    synchronized before returning.
    """
    num_paths = paths.shape[0]
    threads = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    blocks = int(math.ceil(float(num_paths) / threads))

    # instantiate a CUDA stream for queueing async CUDA cmds
    stream = cuda.stream()
    # instantiate a cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(num_paths, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # configure the kernel
    # similar to CUDA-C: step_cuda<<<gridsz, blksz, 0, stream>>>
    launch_step = step_cuda[blocks, threads, stream]

    # transfer the initial prices
    d_previous = cuda.to_device(paths[:, 0], stream=stream)
    for j in range(1, paths.shape[1]):
        # call cuRAND to populate d_normdist with gaussian noises
        prng.normal(d_normdist, mean=0, sigma=1)

        # setup memory for new prices
        # device_array_like is like empty_like for GPU
        d_current = cuda.device_array_like(paths[:, j], stream=stream)

        # invoke step kernel asynchronously
        launch_step(d_previous, d_current, dt, c0, c1, d_normdist)

        # transfer memory back to the host
        d_current.copy_to_host(paths[:, j], stream=stream)
        d_previous = d_current

    # wait for all GPU work to complete
    stream.synchronize()
def const(self, shape, value, out=None):
    """Fill a 1-D or 2-D device array with a constant.

    Parameters
    ----------
    shape : int or tuple
        Shape of the array; a non-tuple is treated as a 1-D length.
    value : scalar
        Value passed to the fill kernel.
    out : DeviceNDArray, optional
        Array to overwrite; must have exactly ``shape``. A float32,
        Fortran-ordered array is allocated when omitted.

    Raises
    ------
    NotImplementedError
        If ``shape`` has more than two dimensions.
    ValueError
        If ``out`` does not have the requested shape.
    """
    if not isinstance(shape, tuple):
        shape = (shape,)
    assert len(shape) > 0
    if len(shape) > 2:
        raise NotImplementedError

    if out is None:
        out = cuda.device_array(shape=shape, dtype=np.float32, order='F')
    if out.shape != shape:
        raise ValueError('matrices are not aligned')

    out_dim = out.shape
    # Divide in floating point before ceil(): integer division would floor
    # first and leave the trailing partial block without threads.
    if out.ndim == 2:
        griddim2 = (int(ceil(out_dim[0] / float(self.blockdim2[0]))),
                    int(ceil(out_dim[1] / float(self.blockdim2[1]))))
        const_m[griddim2, self.blockdim2, self.stream](out, value)
    elif out.ndim == 1:
        griddim = int(ceil(out_dim[0] / float(self.blockdim)))
        const_v[griddim, self.blockdim, self.stream](out, value)
    else:
        raise NotImplementedError
    return out
bitmap[x, y, 1] = int(g * 255.) bitmap[x, y, 2] = int(b * 255.) bitmap[x, y, 3] = 255 if __name__ == "__main__": start = timer() # Create a container for the pixel RGBA information of our image bitmap = np.zeros([DIM, DIM, 4], dtype=np.int16) # Copy to device memory d_bitmap = cuda.to_device(bitmap) # Create empty container for our Sphere data on device d_spheres = cuda.device_array(SPHERES, dtype=Sphere_t) # Create an empty container of spheres on host, and populate it # with some random data. temp_spheres = np.empty(SPHERES, dtype=Sphere_t) for i in xrange(SPHERES): temp_spheres[i]['r'] = rnd(1.0) temp_spheres[i]['g'] = rnd(1.0) temp_spheres[i]['b'] = rnd(1.0) temp_spheres[i]['x'] = rnd(DIM) - DIM / 2 temp_spheres[i]['y'] = rnd(DIM) - DIM / 2 temp_spheres[i]['z'] = rnd(DIM) - DIM / 2 temp_spheres[i]['radius'] = rnd(100.0) + 20 if VERBOSE: sph = temp_spheres[i]
def _update(d):
    """Advance the simulation state ``d`` by one time step and return it.

    ``d`` is a nested dict. This function reads ``d['step']``,
    ``d['field']``, ``d['temp']``, ``d['k_x']``, ``d['k_y']``,
    ``d['kappa']``, ``d['spacing']``, ``d['timestep']``, ``d['density']``,
    ``d['soundspeed']``, ``d['abs_exp']`` and ``d['source']``, and
    overwrites ``d['field']['v_x']``, ``d['field']['v_y']``,
    ``d['temp']['p_x']``, ``d['temp']['p_y']`` and ``d['field']['p']``.
    The ``fft``/``ifft`` calls are given ``d['temp'][...]`` buffers as
    their second argument.

    The x and y components are processed on ``stream1`` and ``stream2``
    respectively; the final pressure sum runs on ``stream3``.

    NOTE(review): the debugging ``print`` calls below are live code and run
    on every step; ``stream4`` is created but never used.
    """
    stream1 = cuda.stream()
    stream2 = cuda.stream()
    stream3 = cuda.stream()
    stream4 = cuda.stream()

    step = d['step']

    """Calculate the pressure gradient. Two steps are needed for this."""
    # Calculate FFT of pressure.
    fft(d['field']['p'], d['temp']['fft_p'], stream=stream1)
    stream1.synchronize()

    # NOTE(review): this allocation is rebound a few lines below without
    # being used.
    ex = cuda.device_array(shape=d['field']['p'].shape)

    # Debug output (live).
    print(d['k_x'].shape)
    print(d['spacing'].shape)
    print(d['k_x'].dtype)
    print(d['spacing'].dtype)
    print(pressure_gradient_exponent(d['k_x'], d['spacing']))

    # The exponent helpers are called without a stream here.
    ex = pressure_gradient_exponent(d['k_x'], d['spacing'])#, stream=stream1)
    ey = pressure_gradient_exponent(d['k_y'], d['spacing'])#, stream=stream2)
    pressure_exponent_x = exp(ex, stream=stream1) # This is a constant!!
    pressure_exponent_y = exp(ey, stream=stream2) # This is a constant!!
    stream1.synchronize()
    stream2.synchronize()

    """Calculate the velocity gradient."""
    ifft(to_gradient(d['temp']['fft_p'], d['k_x'], d['kappa'], pressure_exponent_x, stream=stream1), d['temp']['d_p_d_x'], stream=stream1)
    ifft(to_gradient(d['temp']['fft_p'], d['k_y'], d['kappa'], pressure_exponent_y, stream=stream2), d['temp']['d_p_d_y'], stream=stream2)

    """Calculate the velocity."""
    d['field']['v_x'] = velocity_with_pml(d['field']['v_x'], d['temp']['d_p_d_x'], d['timestep'], d['density'], d['abs_exp']['x'], d['source']['v']['x'][step], stream=stream1)
    d['field']['v_y'] = velocity_with_pml(d['field']['v_y'], d['temp']['d_p_d_y'], d['timestep'], d['density'], d['abs_exp']['y'], d['source']['v']['y'][step], stream=stream2)
    stream1.synchronize()
    stream2.synchronize()

    """Fourier transform of the velocity."""
    fft(d['field']['v_x'], d['temp']['fft_v_x'], stream=stream1)
    fft(d['field']['v_y'], d['temp']['fft_v_y'], stream=stream2)
    stream1.synchronize()
    stream2.synchronize()

    # Second synchronization pair, left over from removed debug output.
    stream1.synchronize()
    stream2.synchronize()

    # Recomputed on every call although marked as constant by the author.
    velocity_exponent_x = exp(velocity_gradient_exponent(d['k_x'], d['spacing'], stream=stream1), stream=stream1) # This is a constant!!
    velocity_exponent_y = exp(velocity_gradient_exponent(d['k_y'], d['spacing'], stream=stream2), stream=stream2) # This is a constant!!

    ifft(to_gradient(d['temp']['fft_v_x'], d['k_x'], d['kappa'], velocity_exponent_x, stream=stream1), d['temp']['d_v_d_x'], stream=stream1)
    ifft(to_gradient(d['temp']['fft_v_y'], d['k_y'], d['kappa'], velocity_exponent_y, stream=stream2), d['temp']['d_v_d_y'], stream=stream2)

    """And finally the pressure."""
    stream1.synchronize()
    stream2.synchronize()

    # The x and y pressure components are updated separately.
    d['temp']['p_x'] = pressure_with_pml(d['temp']['p_x'], d['temp']['d_v_d_x'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['x'], d['source']['p'][step], stream=stream1)
    d['temp']['p_y'] = pressure_with_pml(d['temp']['p_y'], d['temp']['d_v_d_y'], d['timestep'], d['density'], d['soundspeed'], d['abs_exp']['y'], d['source']['p'][step], stream=stream2)
    stream1.synchronize()
    stream2.synchronize()

    # Total pressure is the sum of the two components.
    d['field']['p'] = add(d['temp']['p_x'], d['temp']['p_y'], stream=stream3)

    stream1.synchronize()
    stream2.synchronize()
    stream3.synchronize()

    return d
def test_apriori(): output_file = open("apriori_out.txt", "w") offsets, transactions, num_transactions, num_elements = readFile( "syncthetic_data.txt") print "Offset = ", offsets[:num_transactions] print "transactions = ", transactions[:num_elements] print "Num transactions = ", num_transactions print "Num elements = ", num_elements min_support = MIN_SUPPORT # to find number of max digits required to represent that many number of unique items power = 1 while MAX_UNIQUE_ITEMS / (10**power) != 0: power += 1 print "Power = ", power t = [item for item in transactions.tolist()] if num_elements > NUM_ELEMENTS: print "Error: Elements exceeding NUM_ELEMENTS. Exiting..." sys.exit(12) input_h = np.array(t, dtype=np.int32) print "Input transactions = ", list(input_h) print "Size of transactions = ", input_h.size ci_h = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.int32) li_h = np.empty(MAX_UNIQUE_ITEMS, dtype=np.int32) input_d = cuda.to_device(input_h) ci_d = cuda.to_device(ci_h) li_d = cuda.device_array(MAX_UNIQUE_ITEMS, dtype=np.int32) threads_per_block = (BLOCK_SIZE, 1) number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1) #((NUM_ELEMENTS / threads_per_block[0]) + 1, 1) histogramGPU[number_of_blocks, threads_per_block](input_d, ci_d, num_elements) #cuda.synchronize() ci_d.copy_to_host(ci_h) print "Ci_H Histogram result = ", ci_h # support count for each item number_of_blocks = (int( ceil(MAX_UNIQUE_ITEMS / (1.0 * threads_per_block[0]))), 1) pruneGPU[number_of_blocks, threads_per_block](ci_d, MAX_UNIQUE_ITEMS, min_support) cuda.synchronize() ci_d.copy_to_host(ci_h) print "Keys = ", [i for i in range(0, len(ci_h))] print "Ci_H Pruning result = ", ci_h # support count for each item # calculate concise list of items satisfying min support l1_patterns = {} k = 0 # number of items whose sup_count > min_support for j in range(0, len(ci_h)): if ci_h[j] != 0: li_h[k] = j l1_patterns[(j, )] = ci_h[j] k += 1 print 
"\n=======================================================\n" print "L1 = ", list(li_h)[:k] #items whose support_count > min_support print "\n=======================================================\n" output_file.write(createFormattedPatterns(l1_patterns, 1)) print "K(num_items_with_good_sup_count = ", k #k = 102 ci_h = np.array([-1 for i in range(0, k**2)], dtype=np.int32) ci_d = cuda.to_device(ci_h) #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32) #tli_h = np.array([i for i in range(1, k + 1)], dtype=np.int32) t1 = time() li_d = cuda.to_device(li_h) number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1) print "Self join 2 number of blocks = ", number_of_blocks print "K = ", k print "Ci_H size = ", ci_h.size print "LI_H size = ", li_h.size selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power) cuda.synchronize() li_d.copy_to_host(li_h) ci_d.copy_to_host(ci_h) t2 = time() #sys.exit(0) # f = open('join.txt', 'w') # # for i in range(0, k): # line = "" # for j in range(0, k): # line += str(ci_h[k * i + j]) + " " # f.write(line + "\n") # # f.close() #ci_h = ci_h.reshape(k, k) print "Initial Mask = ", ci_h.reshape(k, k) print "Self joining time = ", (t2 - t1) d_offsets = cuda.to_device(offsets) d_transactions = cuda.to_device(transactions) #number_of_blocks = (1, 1) #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1) number_of_blocks = (int( ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1) print "Num blocks for findFrequency = ", number_of_blocks print "Num transactions = ", num_transactions print "Num patterns = ", k print "index = ", list(li_h)[:k] findFrequencyGPU[number_of_blocks, threads_per_block](d_transactions, d_offsets, num_transactions, num_elements, li_d, ci_d, k) cuda.synchronize() ci_d.copy_to_host(ci_h) print "Final Mask = ", ci_h.reshape(k, k) d_transactions.copy_to_host(transactions) threads_per_block = (BLOCK_SIZE, BLOCK_SIZE) number_of_blocks = ((int(ceil(k / (1.0 * 
threads_per_block[0])))), (int(ceil(k / (1.0 * threads_per_block[0]))))) pruneMultipleGPU[number_of_blocks, threads_per_block]( ci_d, k, min_support) # prunes according to min_support ci_d.copy_to_host(ci_h) print "Outer Mask = ", ci_h.reshape(k, k) ci_hn = np.zeros(k, dtype=np.int32) ci_dn = cuda.to_device(ci_hn) combinationsAvailable[threads_per_block, number_of_blocks]( ci_d, ci_dn, k) #Number of possible patterns in each row ci_dn.copy_to_host(ci_hn) print "Ci_hn = ", list(ci_hn) ci_hnx = np.empty(k, dtype=np.int32) ci_dnx = cuda.to_device(ci_hnx) preScan(ci_dnx, ci_dn, k) # Prefix sum on patterns in each row ci_dnx.copy_to_host(ci_hnx) num_patterns = ci_hnx[-1] print "Ci_hnx = ", list(ci_hnx) sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32) sparseM_d = cuda.to_device(sparseM_h) threads_per_block = (BLOCK_SIZE, 1) number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1) convert2Sparse[threads_per_block, number_of_blocks](ci_d, ci_dnx, sparseM_d, num_patterns, k) sparseM_d.copy_to_host(sparseM_h) # sparseM_h = sparseM_h.reshape(3, num_patterns) print sparseM_h.reshape(3, num_patterns) patterns = {} for i in range(0, num_patterns): item1 = sparseM_h[i] item2 = sparseM_h[i + num_patterns] support = sparseM_h[i + 2 * num_patterns] patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support print "\n=======================================================\n" print "L2 = ", patterns print "\n=======================================================\n" output_file.write(createFormattedPatterns(patterns, 2)) new_modulo_map = {} index_id = 1 actual_pattern_items = [] index_items_lookup = [] #patterns = {(2, 3, 5) : 1, (2, 3, 6) : 1, (2, 3, 7) : 1, (2, 4, 5) : 1, (2, 4, 7) : 1, (3, 5, 7) : 1} for pattern in sorted(patterns.keys()): if pattern[:-1] not in new_modulo_map: new_modulo_map[pattern[:-1]] = index_id prev_len = len(actual_pattern_items) pattern_len = len(pattern[:-1]) actual_pattern_items += pattern[:-1] index_items_lookup += [index_id, prev_len, 
pattern_len] index_id += 1 if (pattern[-1], ) not in new_modulo_map: new_modulo_map[(pattern[-1], )] = index_id prev_len = len(actual_pattern_items) pattern_len = len([pattern[-1]]) actual_pattern_items += [pattern[-1]] index_items_lookup += [index_id, prev_len, pattern_len] index_id += 1 #print "Actual pattern items = ", actual_pattern_items #print "Index lookup = ", index_items_lookup print new_modulo_map new_patterns = [] for pattern in patterns: new_patterns.append( (new_modulo_map[pattern[:-1]], new_modulo_map[(pattern[-1], )])) print new_patterns new_new_pattern = [] for pattern in new_patterns: new_new_pattern.append(pattern[0] * 10**power + pattern[1]) new_new_pattern.sort() print new_new_pattern k = len(new_new_pattern) li_h = np.array(new_new_pattern, dtype=np.int32) ci_h = np.array([-1 for i in range(0, k**2)], dtype=np.int32) ci_d = cuda.to_device(ci_h) #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32) t1 = time() li_d = cuda.to_device(li_h) number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1) selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power) li_d.copy_to_host(li_h) ci_d.copy_to_host(ci_h) api_h = np.array(actual_pattern_items, dtype=np.int32) iil_h = np.array(index_items_lookup, dtype=np.int32) api_d = cuda.to_device(api_h) iil_d = cuda.to_device(iil_h) print "Api_h = ", list(api_h), " Size = ", api_h.size print "IIL_H = ", list(iil_h), " Size = ", iil_h.size t2 = time() print "LI_H = ", li_h print "Initial Mask = ", ci_h.reshape(k, k) #number_of_blocks = (1, 1) #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1) number_of_blocks = (int( ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1) print "Num transactions = ", num_transactions print "Num patterns = ", k print "index = ", li_h print "Size of api_d = ", api_h.size print "Size of iil_h = ", iil_h.size findHigherPatternFrequencyGPU[number_of_blocks, threads_per_block](d_transactions, d_offsets, num_transactions, 
num_elements, li_d, ci_d, k, api_d, iil_d, power, api_h.size, iil_h.size) cuda.synchronize() ci_d.copy_to_host(ci_h) print "Final Mask = ", ci_h.reshape(k, k) #d_transactions.copy_to_host(transactions) #print transactions[:num_elements] threads_per_block = (BLOCK_SIZE, BLOCK_SIZE) number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))), (int(ceil(k / (1.0 * threads_per_block[0]))))) pruneMultipleGPU[number_of_blocks, threads_per_block](ci_d, k, min_support) ci_d.copy_to_host(ci_h) print "Outer Mask = ", ci_h.reshape(k, k) print "K = ", k ci_hn = np.zeros(k, dtype=np.int32) ci_dn = cuda.to_device(ci_hn) combinationsAvailable[threads_per_block, number_of_blocks](ci_d, ci_dn, k) ci_dn.copy_to_host(ci_hn) print "Ci_hn = ", list(ci_hn) ci_hnx = np.empty(k, dtype=np.int32) ci_dnx = cuda.to_device(ci_hnx) preScan(ci_dnx, ci_dn, k) ci_dnx.copy_to_host(ci_hnx) num_patterns = ci_hnx[-1] print list(ci_hnx) sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32) sparseM_d = cuda.to_device(sparseM_h) threads_per_block = (BLOCK_SIZE, 1) number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1) print "K = ", k convert2Sparse[threads_per_block, number_of_blocks](ci_d, ci_dnx, sparseM_d, num_patterns, k) sparseM_d.copy_to_host(sparseM_h) # sparseM_h = sparseM_h.reshape(3, num_patterns) print sparseM_h.reshape(3, num_patterns) patterns = {} for i in range(0, num_patterns): item1 = sparseM_h[i] item2 = sparseM_h[i + num_patterns] support = sparseM_h[i + 2 * num_patterns] patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support print patterns actual_patterns = {} for pattern in patterns: v_common_pat = pattern[0] / (10**power) vitem1 = pattern[0] % (10**power) vitem2 = pattern[1] % (10**power) item1 = actual_pattern_items[index_items_lookup[(vitem1 - 1) * 3 + 1]] item2 = actual_pattern_items[index_items_lookup[(vitem2 - 1) * 3 + 1]] common_pat_start = index_items_lookup[(v_common_pat - 1) * 3 + 1] common_pat_length = index_items_lookup[(v_common_pat - 1) * 3 + 
2] common_pat_end = common_pat_start + common_pat_length common_pattern = actual_pattern_items[common_pat_start:common_pat_end] pattern_key = tuple(common_pattern) + tuple(sorted([item1, item2])) actual_patterns[pattern_key] = patterns[pattern] print "\n=======================================================\n" print "L3 = ", actual_patterns print "\n=======================================================\n" output_file.write(createFormattedPatterns(actual_patterns, 3)) output_file.close()
def backward(dY, cache, g_WLSTM):
    """Backpropagate through the decoder and the LSTM.

    The gate arithmetic runs on the host with NumPy; the per-timestep matrix
    products are delegated to backMultSubroutine, which works on device
    arrays.

    Args:
        dY: gradient on the decoder output. A zero row is prepended here
            because the forward pass did not use the first row.
        cache: dict saved by the forward pass. Keys read: 'Wd', 'Hout',
            'IFOG', 'IFOGf', 'C', 'Hin', 'WLSTM', 'X', 'tanhC_version',
            'drop_prob_encoder', 'drop_prob_decoder', plus the dropout masks
            'U2' / 'U' when the corresponding probability is positive.
        g_WLSTM: forwarded unchanged to backMultSubroutine (presumably the
            device copy of WLSTM -- confirm with the caller).

    Returns:
        dict with gradients under 'WLSTM', 'Wd', 'bd', 'dXi' (row 0 of dX)
        and 'dXs' (remaining rows of dX).
    """
    Wd, Hout, Hin = cache['Wd'], cache['Hout'], cache['Hin']
    IFOG, IFOGf, C = cache['IFOG'], cache['IFOGf'], cache['C']
    WLSTM, X = cache['WLSTM'], cache['X']
    use_tanh = cache['tanhC_version']
    p_encoder = cache['drop_prob_encoder']
    p_decoder = cache['drop_prob_decoder']

    # Transposed, column-major copy of Hin so column t can be sliced per step.
    g_Hin = cuda.to_device(np.asfortranarray(Hin.T))
    n, d = Hout.shape

    # Column blocks of the IFOG matrices, named by position
    # (presumably input, forget, output and candidate gates -- confirm).
    blk_i = slice(0, d)
    blk_f = slice(d, 2 * d)
    blk_o = slice(2 * d, 3 * d)
    blk_g = slice(3 * d, None)
    blk_ifo = slice(0, 3 * d)

    # Add back the row of zeros that the forward pass skipped.
    dY = np.vstack([np.zeros(dY.shape[1]), dY])

    # Decoder gradients.
    dWd = Hout.T.dot(dY)
    dbd = np.sum(dY, axis=0, keepdims=True)
    dHout = dY.dot(Wd.T)
    if p_decoder > 0:
        dHout *= cache['U2']  # undo decoder dropout

    # Host-side buffers (column-major where they are shipped to the GPU).
    dIFOG = np.zeros(IFOG.shape, order='F')
    dIFOGf = np.zeros(IFOGf.shape)
    dWLSTMCp = np.zeros(WLSTM.shape, order='F')
    dHin = np.zeros((1, Hin.shape[1]), order='F')
    dC = np.zeros(C.shape)
    dX = np.zeros(X.shape)

    # Device-side accumulators.
    dWLSTM = cuda.device_array(dWLSTMCp.shape, order='F')
    dWLSTM.copy_to_device(dWLSTMCp)
    g_dHin = cuda.device_array((1, Hin.shape[1]), order='F')

    for t in xrange(n - 1, -1, -1):
        out_act = IFOGf[t, blk_o]
        if use_tanh:
            tanhCt = np.tanh(C[t])  # recomputed rather than cached
            dIFOGf[t, blk_o] = tanhCt * dHout[t]
            # through the tanh non-linearity first, then on into the cell
            dC[t] += (1 - tanhCt**2) * (out_act * dHout[t])
        else:
            dIFOGf[t, blk_o] = C[t] * dHout[t]
            dC[t] += out_act * dHout[t]

        if t > 0:
            dIFOGf[t, blk_f] = C[t - 1] * dC[t]
            dC[t - 1] += IFOGf[t, blk_f] * dC[t]
        dIFOGf[t, blk_i] = IFOGf[t, blk_g] * dC[t]
        dIFOGf[t, blk_g] = IFOGf[t, blk_i] * dC[t]

        # Through the activation functions: tanh on the last block,
        # logistic on the first three.
        dIFOG[t, blk_g] = (1 - IFOGf[t, blk_g]**2) * dIFOGf[t, blk_g]
        y = IFOGf[t, blk_ifo]
        dIFOG[t, blk_ifo] = (y * (1.0 - y)) * dIFOGf[t, blk_ifo]

        # Matrix-multiply backprop on the GPU; stands in for
        #   dWLSTM += np.outer(Hin[t], dIFOG[t])
        #   dHin[t] = dIFOG[t].dot(WLSTM.transpose())
        g_dIFOG = cuda.to_device(dIFOG[t:t + 1])
        g_dHin, dWLSTM = backMultSubroutine(
            g_Hin[:, t:t + 1], g_WLSTM, g_dIFOG, dWLSTM, g_dHin)
        g_dHin.copy_to_host(dHin)

        # Split dHin back into the input part and the previous hidden state.
        dX[t] = dHin[0, 1:1 + d]
        if t > 0:
            dHout[t - 1] += dHin[0, 1 + d:]

    if p_encoder > 0:
        dX *= cache['U']  # undo encoder dropout

    dWLSTM.copy_to_host(dWLSTMCp)
    return {
        'WLSTM': dWLSTMCp,
        'Wd': dWd,
        'bd': dbd,
        'dXi': dX[0, :],
        'dXs': dX[1:, :]
    }
def main(*args):
    """Time the CUDA Black-Scholes kernel on randomly generated options.

    Input data is generated on the device with cuRAND, rescaled by
    c_distribute, and priced `iterations` times; the averaged time is
    printed in milliseconds.

    Args:
        *args: optional positional arguments. When at least two are given,
            the first one is parsed as the iteration count.
            NOTE(review): a single argument is ignored by this guard --
            confirm that is intended.
    """
    OPT_N = 4000000
    iterations = int(args[0]) if len(args) >= 2 else 10

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Random numbers are generated directly on the GPU to avoid transfers.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)

    time0 = time.time()

    # Device buffers for the three option parameters.
    d_stockPrice = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionStrike = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionYears = cuda.device_array(shape=(OPT_N), dtype=np.float32)

    # Base distribution, filled in the same order as the buffers above.
    for d_buffer in (d_stockPrice, d_optionStrike, d_optionYears):
        prng.uniform(d_buffer)

    # Rescale each buffer with its own pair of bounds.
    stream = cuda.stream()
    cfg_distribute = c_distribute[griddim, blockdim, stream]
    rescale_plan = (
        (d_stockPrice, 5.0, 30.0),
        (d_optionStrike, 1.0, 100.0),
        (d_optionYears, 0.25, 10.),
    )
    for d_buffer, low, high in rescale_plan:
        cfg_distribute(d_buffer, low, high)
    stream.synchronize()

    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)
    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)

    time1 = time.time()

    # Configure the kernel once; it is launched repeatedly below.
    cfg_black_scholes_cuda = black_scholes_cuda[griddim, blockdim, stream]
    for _ in range(iterations):
        cfg_black_scholes_cuda(d_callResult, d_putResult, d_stockPrice,
                               d_optionStrike, d_optionYears, RISKFREE,
                               VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
        stream.synchronize()

    time2 = time.time()
    setup_seconds = time1 - time0
    pricing_seconds = time2 - time1
    # Setup time is weighted by 10 before averaging over the iterations.
    dt = setup_seconds * 10 + pricing_seconds
    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Fill `paths` in place, one time step per column, using two streams.

    The rows of `paths` are split into `num_streams` contiguous partitions;
    each partition gets its own CUDA stream, random generator and kernel
    configuration. Column 0 is taken as the starting values and every later
    column j is produced by cu_step from column j - 1.

    Args:
        paths: 2D host array indexed [path, step]; columns 1.. are
            overwritten with the results.
        dt: time step, passed to the kernel and used in c1.
        interest: interest rate used in the drift term c0.
        volatility: volatility used in c0 and c1.
    """
    n = paths.shape[0]
    num_streams = 2

    # Contiguous row partitions; the last one is clipped to n.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    # Pool of reusable device buffers, one partition wide each.
    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [
        int(math.ceil(float(partlen) / blksz)) for partlen in partlens
    ]

    strmlist = [cuda.stream() for _ in range(num_streams)]
    prnglist = [
        curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist
    ]

    # Device-side arrays for the normal draws of each partition.
    d_normlist = [
        cuda.device_array(partlen, dtype=np.double, stream=strm)
        for partlen, strm in zip(partlens, strmlist)
    ]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel once per stream.
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [
        cu_step[gridsz, blksz, strm]
        for gridsz, strm in zip(gridszlist, strmlist)
    ]

    d_lastlist = [
        cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
        for (s, e), strm in zip(partitions, strmlist)
    ]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [
            cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm))
            for (s, e), strm in zip(partitions, strmlist)
        ]

        for step, args in zip(steplist,
                              zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Return each partition's previous-step buffer to the pool on its
        # own stream. The previous code reused the `d_last` left over from
        # the loop above, so the last partition's buffer was released once
        # per stream and the other partitions' buffers were never released.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
def backward(dY, cache, g_WLSTM):
    """Compute decoder and LSTM gradients from the cached forward pass.

    Gate gradients are evaluated on the host; the weight-gradient
    accumulation and the gradient on Hin are produced per timestep by
    backMultSubroutine on device arrays.

    Args:
        dY: gradient on the decoder output; a row of zeros is stacked on top
            here because the forward pass ignored that row.
        cache: forward-pass dict. Keys read: 'Wd', 'Hout', 'IFOG', 'IFOGf',
            'C', 'Hin', 'WLSTM', 'X', 'tanhC_version', 'drop_prob_encoder',
            'drop_prob_decoder', and the masks 'U2' / 'U' when dropout was
            applied.
        g_WLSTM: handed as-is to backMultSubroutine (presumably WLSTM on the
            device -- confirm).

    Returns:
        dict with keys 'WLSTM', 'Wd', 'bd', 'dXi' and 'dXs'.
    """
    Hin = cache['Hin']
    Hout = cache['Hout']
    Wd = cache['Wd']
    C = cache['C']
    IFOGf = cache['IFOGf']
    tanhC_version = cache['tanhC_version']
    n, d = Hout.shape
    hin_width = Hin.shape[1]

    # Hin is uploaded transposed and column-major; step t is column t.
    g_Hin = cuda.to_device(np.asfortranarray(Hin.T))

    # Put back the row of zeros that the forward pass did not use.
    zero_row = np.zeros(dY.shape[1])
    dY = np.vstack([zero_row, dY])

    # --- decoder ---
    dWd = Hout.T.dot(dY)
    dbd = dY.sum(axis=0, keepdims=True)
    dHout = dY.dot(Wd.T)
    if cache['drop_prob_decoder'] > 0:
        dHout *= cache['U2']

    # --- LSTM buffers ---
    dIFOG = np.zeros(cache['IFOG'].shape, order='F')
    dIFOGf = np.zeros(IFOGf.shape)
    dWLSTMCp = np.zeros(cache['WLSTM'].shape, order='F')
    dWLSTM = cuda.device_array(dWLSTMCp.shape, order='F')
    dWLSTM.copy_to_device(dWLSTMCp)
    dHin = np.zeros((1, hin_width), order='F')
    g_dHin = cuda.device_array((1, hin_width), order='F')
    dC = np.zeros(C.shape)
    dX = np.zeros(cache['X'].shape)

    for t in reversed(xrange(n)):
        act = IFOGf[t]    # row view: activations at step t
        dact = dIFOGf[t]  # row view: writes land in dIFOGf
        has_prev = t > 0

        if tanhC_version:
            tanhCt = np.tanh(C[t])  # recomputed here
            dact[2 * d:3 * d] = tanhCt * dHout[t]
            # tanh non-linearity first, then continue into the cell state
            dC[t] += (1 - tanhCt ** 2) * (act[2 * d:3 * d] * dHout[t])
        else:
            dact[2 * d:3 * d] = C[t] * dHout[t]
            dC[t] += act[2 * d:3 * d] * dHout[t]

        if has_prev:
            dact[d:2 * d] = C[t - 1] * dC[t]
            dC[t - 1] += act[d:2 * d] * dC[t]
        dact[:d] = act[3 * d:] * dC[t]
        dact[3 * d:] = act[:d] * dC[t]

        # activation-function derivatives
        dIFOG[t, 3 * d:] = (1 - act[3 * d:] ** 2) * dact[3 * d:]
        y = act[:3 * d]
        dIFOG[t, :3 * d] = (y * (1.0 - y)) * dact[:3 * d]

        # GPU replacement for:
        #   dWLSTM += np.outer(Hin[t], dIFOG[t])
        #   dHin[t] = dIFOG[t].dot(WLSTM.transpose())
        g_dIFOG = cuda.to_device(dIFOG[t:t + 1])
        g_dHin, dWLSTM = backMultSubroutine(g_Hin[:, t:t + 1], g_WLSTM,
                                            g_dIFOG, dWLSTM, g_dHin)
        g_dHin.copy_to_host(dHin)

        # identity transforms back into the pieces Hin was built from
        dX[t] = dHin[0, 1:1 + d]
        if has_prev:
            dHout[t - 1] += dHin[0, 1 + d:]

    if cache['drop_prob_encoder'] > 0:
        dX *= cache['U']  # encoder dropout

    dWLSTM.copy_to_host(dWLSTMCp)
    grads = {
        'WLSTM': dWLSTMCp,
        'Wd': dWd,
        'bd': dbd,
        'dXi': dX[0, :],
        'dXs': dX[1:, :],
    }
    return grads
def add(self, a, b, out = None, alpha = 1., beta = 1.):
    """Pointwise addition of two scalars, 1D, or 2D arrays.

    Behaves like numpy array in terms of broadcasting: computes
    alpha * a + beta * b, where the smaller operand is broadcast onto the
    larger one.

    Parameters
    ----------
    a : array-like
        Array to add.
    b : array-like
        Array to add.
    out : DeviceNDArray (optional)
        Result will overwrite out if given. Its shape must equal the shape
        of the result (the larger operand).
    alpha : float (optional)
        Scales a before addition.
    beta : float
        Scales b before addition.

    Returns
    -------
    DeviceNDArray
        `out` if it was given, otherwise a newly allocated column-major
        device array.

    Raises
    ------
    ValueError
        If the operand shapes cannot be broadcast or `out` has the wrong
        shape.
    NotImplementedError
        If `out` is neither None nor a DeviceNDArray, or the combination of
        operand ranks is not supported.
    """
    b, out_dtype = _check_array(b)
    a, out_dtype = _check_array(a)

    if out is not None and not isinstance(
            out, cuda.cudadrv.devicearray.DeviceNDArray):
        raise NotImplementedError

    a_dim = a.shape
    b_dim = b.shape
    scaled = alpha != 1. or beta != 1.

    def prepare_out(shape):
        """Allocate the result, or check that the caller's `out` fits it."""
        if out is None:
            return cuda.device_array(shape, dtype=out_dtype, order='F')
        if tuple(out.shape) != tuple(shape):
            raise ValueError('matrices are not aligned')
        return out

    def launch(plain_kernel, scaled_kernel, big, small, w_big, w_small):
        """Run a pointwise kernel over `big`, broadcasting `small` onto it."""
        result = prepare_out(big.shape)
        # Integer ceiling division: ceil(x / 32) would floor first under
        # Python 2 integer division and leave the tail of the array
        # uncovered whenever a dimension is not a multiple of the block.
        if big.ndim == 1:
            blockdim = 32
            griddim = -(-big.shape[0] // blockdim)
        else:
            blockdim = (32, 32)
            griddim = (-(-big.shape[0] // blockdim[0]),
                       -(-big.shape[1] // blockdim[1]))
        if scaled:
            scaled_kernel[griddim, blockdim, self.stream](
                big, small, w_big, w_small, result)
        else:
            plain_kernel[griddim, blockdim, self.stream](big, small, result)
        return result

    def matrix_vector(matrix, vector, w_matrix, w_vector):
        """Add a vector along whichever matrix axis matches its length."""
        if vector.shape[0] == matrix.shape[0]:
            return launch(mv0_add_pointwise, mv0_sadd_pointwise,
                          matrix, vector, w_matrix, w_vector)
        if vector.shape[0] == matrix.shape[1]:
            return launch(mv1_add_pointwise, mv1_sadd_pointwise,
                          matrix, vector, w_matrix, w_vector)
        raise ValueError('matricies are not aligned')

    # Matrix-matrix addition
    if a.ndim == 2 and b.ndim == 2:
        # Full-size matrices go through BLAS.
        if a_dim == b_dim:
            out = prepare_out(a_dim)
            self.blas.geam('N', 'N', a_dim[0], a_dim[1],
                           alpha, a, beta, b, out)
            return out
        # np.newaxis matrices: b is a single column / single row of a.
        if a_dim[0] == b_dim[0] and b_dim[1] == 1:
            return launch(m_mn_add_pointwise, m_mn_sadd_pointwise,
                          a, b, alpha, beta)
        if a_dim[1] == b_dim[1] and b_dim[0] == 1:
            return launch(m_nm_add_pointwise, m_nm_sadd_pointwise,
                          a, b, alpha, beta)
        # Mirrored cases: a is the thin operand, so the result (and any
        # caller-supplied `out`) has b's shape and the weights swap.
        if b_dim[0] == a_dim[0] and a_dim[1] == 1:
            return launch(m_mn_add_pointwise, m_mn_sadd_pointwise,
                          b, a, beta, alpha)
        if b_dim[1] == a_dim[1] and a_dim[0] == 1:
            return launch(m_nm_add_pointwise, m_nm_sadd_pointwise,
                          b, a, beta, alpha)
        raise ValueError('matrices are not aligned')

    # Vector-vector addition
    if a.ndim == 1 and b.ndim == 1:
        if a_dim[0] != b_dim[0]:
            raise ValueError('matricies not aligned')
        return launch(vadd_pointwise, vsadd_pointwise, a, b, alpha, beta)

    # Matrix-scalar / scalar-matrix addition
    if a.ndim == 2 and b.ndim == 0:
        return launch(ms_add_pointwise, ms_sadd_pointwise,
                      a, b, alpha, beta)
    if a.ndim == 0 and b.ndim == 2:
        return launch(ms_add_pointwise, ms_sadd_pointwise,
                      b, a, beta, alpha)

    # Vector-scalar / scalar-vector addition
    if a.ndim == 1 and b.ndim == 0:
        return launch(vs_add_pointwise, vs_sadd_pointwise,
                      a, b, alpha, beta)
    if a.ndim == 0 and b.ndim == 1:
        return launch(vs_add_pointwise, vs_sadd_pointwise,
                      b, a, beta, alpha)

    # Matrix-vector / vector-matrix addition
    if a.ndim == 2 and b.ndim == 1:
        return matrix_vector(a, b, alpha, beta)
    if a.ndim == 1 and b.ndim == 2:
        return matrix_vector(b, a, beta, alpha)

    raise NotImplementedError
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    """Randomised sparse-PCA search on the GPU; returns the best vector.

    ceil((4 / epsilon) ** d) random d-dimensional directions are drawn in
    batches of at most 6400. For each direction c, a = Vd.dot(c) is computed
    on the device, each column is sorted, and the norm over its last k
    entries is taken. The column with the largest norm defines the result.

    Args:
        Vd: 2D host array with d columns (uploaded once to the device).
        epsilon: controls the number of random samples.
        d: dimension of each random direction.
        k: number of entries kept per sample.

    Returns:
        (Vd.shape[0], 1) array that is zero except at the k selected row
        indices, which hold the selected values divided by their norm.
    """
    num_rows = Vd.shape[0]
    total_samples = int(math.ceil((4. / epsilon)**d))
    print(total_samples)
    batch_limit = 6400
    threads = 512
    tile = 32

    # Best candidate seen so far.
    opt_x = np.zeros((num_rows, 1), dtype=float_dtype)
    opt_v = -np.inf

    dVd = cuda.to_device(Vd)
    custr = cuda.stream()
    prng = curand.PRNG(stream=custr)

    remaining = total_samples
    while remaining:
        numSamples = min(remaining, batch_limit)
        remaining -= numSamples

        # Per-batch device storage.
        dA = cuda.device_array(shape=(num_rows, numSamples), order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(num_rows, numSamples),
                               dtype=np.uint32, order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples), order='F',
                               dtype=Vd.dtype)

        # All random samples are generated up front and normalised on the
        # device.
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(dC.shape[1], threads), threads,
                         custr](dC, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement.
        # One block per sample, so this scales with the batch size.
        batch_matmul[numSamples, threads, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Segment boundaries are the starts of columns 1..numSamples-1.
        # (An earlier RadixSort / argselect k-selection was replaced by
        # this full segmented sort; the selection dominates the run time.)
        segments = np.arange(1, numSamples, dtype=np.int32) * num_rows
        index_grid = (divup(dI.shape[0], tile), divup(dI.shape[1], tile))
        init_indices[index_grid, (tile, tile), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # Keep only the last k rows of the sorted values and indices.
        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, threads), threads,
                   custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)
        custr.synchronize()

        for i in xrange(numSamples):
            val = aInorm[i]
            if not val > opt_v:
                continue
            opt_v = val
            opt_x.fill(0)
            # Only the winning column is copied back to the host.
            Ik = dI[:, i].copy_to_host()
            aIk = dA[:, i].copy_to_host().reshape(k, 1)
            opt_x[Ik] = (aIk / val)

        # Free this batch's allocations before the next one.
        del dA, dI, daInorm, dC

    return opt_x
def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200, display=True, verbose=False): b = cublas.Blas() c = cusparse.Sparse() descr = c.matdescr() (m, n) = Phi.shape (m, batch) = I.shape if L == None: L = scipy.sparse.linalg.svds(Phi, 1, which='LM', return_singular_vectors=False) print "Max eigenvalue: ." + str(L) L = (L**2)*4 # L = svd(Phi) -> eig(2*(Phi.T*Phi)) invL = 1/L t = 1. #if sps.issparse(Phi): # Phi = np.array(Phi.todense()) d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F')) # d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F')) d_Phi = cusparse.csr_matrix(Phi, dtype=np.float32) d_PhiT = cusparse.csr_matrix(Phi.T, dtype=np.float32) # hack because csrgemm issues with 'T' # d_Q = cuda.device_array((n, n), dtype=np.float32, order='F') d_c = cuda.device_array((n, batch), dtype=np.float32, order='F') d_x = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F')) d_y = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F')) d_x2 = cuda.to_device(np.array(np.zeros((n, batch), dtype=np.float32), order='F')) # Temporary array variables d_t = cuda.device_array((m, batch), dtype=np.float32, order='F') d_t2 = cuda.device_array(n*batch, dtype=np.float32, order='F') #b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q) # Q = Phi^T * Phi #b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c) # c = -2*Phi^T * y # c.csrgemm('T', 'N', n, n, m, descr, d_Phi.nnz, d_Phi.data, d_Phi.indptr, d_Phi.indices, # descr, d_Phi.nnz, d_Phi.data, d_Phi.indptr, d_Phi.indices, descr, d_Q.data, d_Q.indptr, d_Q.indices) d_Q = c.csrgemm_ez(d_PhiT, d_Phi, transA='N', transB='N') c.csrmm('T', m, batch, n, d_Phi.nnz, -2, descr, d_Phi.data, d_Phi.indptr, d_Phi.indices, d_I, m, 0, d_c, n) blockdim = 32, 32 griddim = int(math.ceil(n/blockdim[0])), int(math.ceil(batch/blockdim[1])) blockdim_1d = 256 griddim_1d = int(math.ceil(n*batch/blockdim_1d)) start = l2l1obj(b, c, descr, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d, 
griddim_1d) obj2 = start for i in xrange(max_iterations): # x2 = 2*Q*y + c # b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2) c.csrmm('N', n, batch, n, d_Q.nnz, 2, descr, d_Q.data, d_Q.indptr, d_Q.indices, d_y, n, 0, d_x2, n) b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2) # x2 = y - invL * x2 b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2) # proxOp() l1prox[griddim, blockdim](d_x2, invL*lambdav, d_x2) t2 = (1+math.sqrt(1+4*(t**2)))/2.0 # y = x2 + ((t-1)/t2)*(x2-x) b.geam('N', 'N', n, batch, 1+(t-1)/t2, d_x2, (1-t)/t2, d_x, d_y) # x = x2 b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x) t = t2 # update objective obj = obj2 obj2 = l2l1obj(b, c, descr, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d, griddim_1d) if verbose: x2 = d_x2.copy_to_host() print "L1 Objective: " + str(obj2) # print "L1 Objective: " + str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2)) if np.abs(obj-obj2)/float(obj) < tol: break x2 = d_x2.copy_to_host() if display: print "FISTA Iterations: " + str(i) # print "L1 Objective: " + str(obj2) print "L1 Objective: " + str(lambdav*np.sum(np.abs(x2)) + np.sum((I-Phi.dot(x2))**2)) print "Objective delta: " + str(obj2-start) return x2
# Simulation driver: for each requested path, reset the per-path bookkeeping
# lists and run `steps` iterations that allocate device buffers, compute the
# cluster information of grid A on the host and upload the grids to the GPU.
# NOTE(review): nesting below is reconstructed from a whitespace-collapsed
# source -- confirm against the original file.
pricePath = []
for j in range(paths):
    print "Generating path: %s" % j
    # plotting lists
    LogReturns, nLogReturns = [0], [0] # log returns, normalized log returns
    xchange, xcorrelation = [], [] # change in price P, and autocorrelation
    activeTraders = [] # number of active traders
    prices = [initialPrice]
    price = initialPrice
    for i in range(steps):
        # Allocate device side arrays, one double per grid cell (w*h),
        # on the shared stream
        enterProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        activateProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        choiceProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        diffuseProbs = cuda.device_array(w*h, dtype=np.double, stream=stream)
        # calculate cluster info
        cluster, clusterSize, nClust, nClustOnes = calcCluster(A) # get cluster info
        # one entry per cluster
        xis = cuda.device_array(nClust, dtype=np.double, stream = stream)
        eta = cuda.device_array(w*h, dtype=np.double, stream = stream) # 1 -> w*h*w*h, how to index???
        # the stream is synchronized when this block exits, so the uploads
        # below have completed afterwards
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream) # upload grid
            dB = cuda.to_device(B, stream) # upload new location
            dCluster = cuda.to_device(cluster, stream) # upload cluster grid to GPU
def score_sequence(seq, pssm, verbose=False, keep_strands=True,
                   benchmark=False, blocks_per_grid=-1, threads_per_block=-1):
    """
    Score a nucleotide sequence against a PSSM with a sliding window that is
    parallelized on a GPU.

    Args:
        seq: Integer representation of the nucleotide sequence, where the
            alphabet is (A = 0, C = 1, G = 2, T = 3). It must be a vector
            (1D array) of integers that can be cast to int32 (see
            numpy.int32).
        pssm: Vectorized (1D) PSSM where every four elements correspond to
            one position, in the order A, C, G, T. It must be castable to an
            array of float64.
        verbose: Set this to True to print performance information.
        benchmark: If set to True, information about the run is returned in
            a dictionary as the last element of the result.
        keep_strands: Whether memory should be allocated for storing which
            strand each score comes from. Set this to False if you just want
            the scores; the strands array will then not be returned.
            NOTE: If both this and benchmark are False, the scores are
            returned directly rather than wrapped in a tuple:
                >>> scores = score_sequence(seq, pssm, keep_strands=False)
        blocks_per_grid: Blocks per grid assigned to the CUDA kernel. See
            this SO question for info on choosing this value:
            http://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid
            Set this to a non-positive number (the default) to use
            ceil(len(seq) / threads_per_block). The value is always capped
            at the maximum grid dimension supported by the GPU.
        threads_per_block: Threads per block. Set this to a non-positive
            number (the default) to use 55% of the maximum number of threads
            per block supported by the GPU, a value determined
            experimentally. Higher values will likely result in failure to
            allocate resources to the kernel (since there will not be enough
            register space for each thread).

    Returns:
        scores: 1D float64 array of length (n - w + 1), where n is the
            length of the sequence and w is the window size. The value at
            index i corresponds to the score of the w-mer at position i in
            the sequence.
        strands: 1D int32 array of length (n - w + 1). The value at position
            i is either 0 or 1, corresponding to the strand of the score at
            that position, where 0 means forward and 1 means reverse. Only
            returned when keep_strands is True.
        run_info: Dictionary returned when benchmark is True. It contains:
                >>> run_info.keys()
                ['memory_used', 'genome_size', 'runtime', 'threads_per_block', 'blocks_per_grid']
            Note that memory_used is rather misleading if the function is
            run more than once. CUDA is optimized to not transfer the same
            data from the host to the device so it will not always change.
            It may also unload other assets from memory, so the value can be
            negative.
            TODO: Find a better method of calculating memory usage.

    Raises:
        ValueError: If the PSSM has fewer than four elements or the sequence
            is shorter than the PSSM window.

    Example:
        >>> pssm = np.random.uniform(-7.5, 2.0, 4 * 16) # Window size of 16
        >>> seq = np.random.randint(0, 4, int(30e6)) # Random 30 million bp sequence
        >>> scores, strands, run_info = score_sequence(seq, pssm, benchmark=True, verbose=True)
        Threads per block = 563
        Blocks per grid = 53286
        Total threads = 30000018
        Genome size: 3e+07 bp
        Time: 605.78 ms (using time.time())
        Speed: 4.95229e+07 bp/sec
        >>> scores
        array([-16.97089798, -33.48925866, -21.80381526, ..., -10.27919401,
               -32.64575614, -23.97110103])
        >>> strands
        array([1, 1, 1, ..., 1, 1, 0])
        >>> run_info
        {'memory_used': 426508288L, 'genome_size': 30000000, 'runtime': 0.28268090518054123, 'threads_per_block': 563, 'blocks_per_grid': 53286}

        A more interesting interpretation of the run information for
        performance analysis is the number of bases scored per second:
        >>> print "%g bases/sec" % (run_info["genome_size"] / run_info["runtime"])
        1.06127e+08 bases/sec
    """
    w = int(pssm.size // 4)  # width of PSSM
    n = int(seq.size)  # length of the sequence being scored
    n_windows = n - w + 1
    if w < 1:
        raise ValueError("pssm must contain at least one position (4 values)")
    if n_windows < 1:
        raise ValueError("sequence (length %d) is shorter than the PSSM "
                         "window (width %d)" % (n, w))

    # Reverse-complement of the PSSM. In the flat layout
    # [pos0: A C G T, pos1: A C G T, ...] reversing the whole vector both
    # reverses the positions and swaps A<->T and C<->G, which is exactly the
    # reverse complement. (The previous index expression,
    # i / 4 + (3 - (i % 4)), only ever read the first w + 3 entries.)
    pssm_r = np.ascontiguousarray(pssm[::-1], dtype=np.float64)

    # Calculate the appropriate threads per block and blocks per grid. Each
    # value falls back to its default independently, so a caller may pin
    # just one of them.
    device = cuda.get_current_device()
    if threads_per_block <= 0:
        # We don't use the max number of threads to avoid running out of
        # register space by saturating the streaming multiprocessors.
        # ~55% was found empirically, but your mileage may vary with
        # different GPUs.
        threads_per_block = int(device.MAX_BLOCK_DIM_X * 0.55)
    if blocks_per_grid <= 0:
        # We saturate our grid and let the dynamic scheduler assign the
        # blocks to the discrete CUDA cores/streaming multiprocessors.
        blocks_per_grid = int(math.ceil(float(n) / threads_per_block))
    if blocks_per_grid > device.MAX_GRID_DIM_X:
        blocks_per_grid = device.MAX_GRID_DIM_X

    if verbose:
        print("Threads per block = %d" % threads_per_block)
        print("Blocks per grid = %d" % blocks_per_grid)
        print("Total threads = %d" % (threads_per_block * blocks_per_grid))

    # Collect benchmarking info; get_memory_info()[0] is read before and
    # after the run and the difference is reported as memory used.
    s = default_timer()
    context = cuda.current_context()
    start_mem = context.get_memory_info()[0]

    # All transfers and the kernel are queued on one stream so they execute
    # in issue order.
    stream = cuda.stream()

    # Copy data to device
    d_pssm = cuda.to_device(pssm.astype(np.float64), stream)
    d_pssm_r = cuda.to_device(pssm_r, stream)
    d_seq = cuda.to_device(seq.astype(np.int32), stream)

    # Allocate memory on device to store results, run the kernel and copy
    # the results back to the host.
    d_scores = cuda.device_array(n_windows, dtype=np.float64, stream=stream)
    if keep_strands:
        d_strands = cuda.device_array(n_windows, dtype=np.int32,
                                      stream=stream)
        cuda_score[blocks_per_grid, threads_per_block, stream](
            d_pssm, d_pssm_r, d_seq, d_scores, d_strands)
        scores = d_scores.copy_to_host(stream=stream)
        strands = d_strands.copy_to_host(stream=stream)
    else:
        cuda_score_without_strands[blocks_per_grid, threads_per_block,
                                   stream](d_pssm, d_pssm_r, d_seq, d_scores)
        scores = d_scores.copy_to_host(stream=stream)
    stream.synchronize()

    # Collect benchmarking info
    end_mem = context.get_memory_info()[0]
    t = default_timer() - s
    mem_used = start_mem - end_mem

    # Output info on the run if verbose parameter is true
    if verbose:
        print("Genome size: %g bp" % n)
        print("Time: %.2f ms (using time.%s())"
              % (t * 1000, default_timer.__name__))
        print("Speed: %g bp/sec" % (n / t))
        print("Global memory: %d bytes used (%.2f%% of total)"
              % (mem_used,
                 float(mem_used) * 100 / context.get_memory_info()[1]))

    # Return the run information for benchmarking
    run_info = {"genome_size": n, "runtime": t, "memory_used": mem_used,
                "blocks_per_grid": blocks_per_grid,
                "threads_per_block": threads_per_block}

    # The shape of the result depends on keep_strands and benchmark.
    if keep_strands:
        if benchmark:
            return (scores, strands, run_info)
        return (scores, strands)
    if benchmark:
        return (scores, run_info)
    # Careful! This is not a tuple, so callers do not index the result with
    # [0] to get the scores.
    return scores
def mult(self, a, b, out=None, alpha=None):
    """Pointwise multiplication of two 1D or 2D arrays.

    Parameters
    ----------
    a : array-like
        Array to multiply.
    b : array-like
        Array to multiply. Must have the same shape as ``a`` and dtype
        float32.
    out : DeviceNDArray (optional)
        Result will overwrite out if given. Must have the same shape as
        the inputs.
    alpha : float
        Additional scale factor for multiplication. Not implemented yet:
        any value other than None raises NotImplementedError.

    Returns
    -------
    DeviceNDArray
        ``out`` if it was given, otherwise a newly allocated
        Fortran-ordered device array holding the product.

    Raises
    ------
    NotImplementedError
        If ``alpha`` is given, ``out`` is neither None nor a
        DeviceNDArray, ``b`` is not float32, or the inputs are not both
        1D or both 2D.
    ValueError
        If the shapes of ``a``, ``b`` and ``out`` do not match.
    """
    if alpha is not None:
        raise NotImplementedError

    b, out_dtype = _check_array(b)
    a, out_dtype = _check_array(a)

    if out is not None and not isinstance(
            out, cuda.cudadrv.devicearray.DeviceNDArray):
        raise NotImplementedError
    if b.dtype != np.float32:
        raise NotImplementedError

    a_dim = a.shape
    b_dim = b.shape

    if a.ndim == 2 and b.ndim == 2:
        # Pointwise product needs identical shapes: a mismatch in EITHER
        # dimension is an error (the previous `and` let a single
        # mismatched dimension through).
        if a_dim[0] != b_dim[0] or a_dim[1] != b_dim[1]:
            raise ValueError('matrices are not aligned')
        if out is None:
            out = cuda.device_array((a_dim[0], a_dim[1]), dtype=out_dtype,
                                    order='F')
        elif tuple(out.shape) != (a_dim[0], a_dim[1]):
            raise ValueError('matrices are not aligned')
        blockdim2 = (32, 32)
        # Integer ceil-division so the grid also covers a ragged last
        # block; ceil(x / y) on two ints truncates first under Python 2.
        griddim2 = ((a_dim[0] + blockdim2[0] - 1) // blockdim2[0],
                    (a_dim[1] + blockdim2[1] - 1) // blockdim2[1])
        mmultiply_pointwise[griddim2, blockdim2, self.stream](a, b, out)
    elif a.ndim == 1 and b.ndim == 1:
        if a_dim[0] != b_dim[0]:
            raise ValueError('matricies not aligned')
        if out is None:
            out = cuda.device_array(a_dim[0], dtype=out_dtype, order='F')
        elif tuple(out.shape) != (a_dim[0],):
            raise ValueError('matrices are not aligned')
        blockdim = 32
        griddim = (a_dim[0] + blockdim - 1) // blockdim
        vmultiply_pointwise[griddim, blockdim, self.stream](a, b, out)
    else:
        raise NotImplementedError
    return out
def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200,
          display=True, verbose=False):
    """
    Run FISTA on the GPU to minimise, for every column of I,
        lambdav * sum(|x|) + sum((I - Phi.dot(x)) ** 2)

    I: Images, array of shape (m, batch)
    Phi: Dictionary, array of shape (m, n); a scipy.sparse matrix is
         converted to a dense array
    lambdav: Sparse Penalty
    L: Largest singular value of Phi (computed with svds when None). It is
       squared and doubled to obtain the step-size constant of
       2 * Phi.T * Phi.
    tol: Stop once the relative change of the objective between two
         iterations drops below this value
    max_iterations: Upper bound on the number of iterations
    display: Print a summary (iterations, objective, delta) at the end
    verbose: Print the objective after every iteration

    Returns the coefficients as a host array of shape (n, batch), float32.
    Raises ValueError if I and Phi do not have the same number of rows.
    """
    b = numbapro.cudalib.cublas.Blas()
    (m, n) = Phi.shape
    (m_images, batch) = I.shape
    if m_images != m:
        raise ValueError("I has %d rows but Phi has %d" % (m_images, m))

    if L is None:
        L = scipy.sparse.linalg.svds(Phi, 1, which='LM',
                                     return_singular_vectors=False)
        print("Max eigenvalue: ." + str(L))
    # svds returns a one-element array and callers may pass an int; use a
    # plain float so 1 / L can never be an integer division and the
    # kernels/BLAS calls receive real scalars.
    # NOTE(review): the transform below is applied to a caller-supplied L as
    # well -- confirm this matches the original layout.
    L = float(np.ravel(L)[0])
    L = (L ** 2) * 2  # L = svd(Phi) -> eig(2*(Phi.T*Phi))
    invL = 1.0 / L
    t = 1.

    if sps.issparse(Phi):
        Phi = np.array(Phi.todense())

    d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
    d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F'))
    d_Q = cuda.device_array((n, n), dtype=np.float32, order='F')
    d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
    d_x = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
    d_y = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
    d_x2 = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))

    # Temporary array variables
    d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
    d_t2 = cuda.device_array(n * batch, dtype=np.float32, order='F')

    b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q)  # Q = Phi^T * Phi
    b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c)  # c = -2*Phi^T * y

    # Integer ceil-division: math.ceil(a / b) on two ints truncates before
    # the ceil under Python 2 and leaves the ragged edge (or everything,
    # when the size is below the block size) without threads.
    blockdim = 32, 32
    griddim = ((n + blockdim[0] - 1) // blockdim[0],
               (batch + blockdim[1] - 1) // blockdim[1])
    blockdim_1d = 256
    griddim_1d = (n * batch + blockdim_1d - 1) // blockdim_1d

    start = l2l1obj(b, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d,
                    griddim_1d)
    obj2 = start

    i = 0  # keeps the summary below valid when max_iterations is 0
    for i in range(max_iterations):
        # x2 = 2*Q*y + c
        b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2)
        b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)

        # x2 = y - invL * x2
        b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

        # proxOp()
        l1prox[griddim, blockdim](d_x2, invL * lambdav, d_x2)
        t2 = (1 + math.sqrt(1 + 4 * (t ** 2))) / 2.0

        # y = x2 + ((t-1)/t2)*(x2-x)
        b.geam('N', 'N', n, batch, 1 + (t - 1) / t2, d_x2, (1 - t) / t2,
               d_x, d_y)

        # x = x2
        b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
        t = t2

        # update objective
        obj = obj2
        obj2 = l2l1obj(b, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d,
                       griddim_1d)

        if verbose:
            print("L1 Objective: " + str(obj2))

        # A previous objective of exactly zero cannot be improved and would
        # make the relative change a division by zero.
        if obj == 0 or np.abs(obj - obj2) / float(obj) < tol:
            break

    x2 = d_x2.copy_to_host()

    if display:
        print("FISTA Iterations: " + str(i))
        print("L1 Objective: " + str(lambdav * np.sum(np.abs(x2)) +
                                     np.sum((I - Phi.dot(x2)) ** 2)))
        print("Objective delta: " + str(obj2 - start))

    return x2
import numpy as np
from numbapro import cuda
from PIL import Image


@cuda.jit("void(float32[:], float32[:])", target="gpu")
def blur(input_img, blurred_img):
    """Placeholder kernel: writes 4.0 to every element of blurred_img.

    One thread handles one element; threads whose index is not below the
    length of input_img return without writing. input_img is not read yet.
    """
    index = cuda.grid(1)
    if index >= input_img.shape[0]:
        return
    blurred_img[index] = 4.0


if __name__ == "__main__":
    # The kernel is compiled for float32[:] only, so the host and device
    # buffers must be float32 (NumPy and device_array default to float64).
    img = np.ones(100, dtype=np.float32)
    blurred_img = np.zeros(100, dtype=np.float32)

    d_input_img = cuda.to_device(img)
    d_blurred_img = cuda.device_array(img.shape[0], dtype=np.float32)

    threads_per_block = 256
    # Round up so that every element gets a thread.
    n_blocks = (img.shape[0] + threads_per_block - 1) // threads_per_block

    # Host buffer before the kernel ran.
    for num in blurred_img:
        print(num)

    blur[n_blocks, threads_per_block](d_input_img, d_blurred_img)
    print("####################")

    # Host buffer after copying the kernel output back.
    d_blurred_img.copy_to_host(blurred_img)
    for num in blurred_img:
        print(num)
    print("Finished")
def spca(Vd, epsilon=0.1, d=3, k=10):
    """Randomised search for a k-sparse direction, evaluated in GPU batches.

    int((4. / epsilon) ** d) random d-dimensional vectors are drawn and
    processed in batches of at most 32000. For every sample the GPU computes
    a = Vd.dot(c), selects k indices of a and the norm of a restricted to
    them; the sample with the largest norm seen so far defines the result.

    Returns a (p, 1) array (p = Vd.shape[0]) that is zero except at the k
    selected indices of the best sample, where it holds a / norm.
    """
    p = Vd.shape[0]
    samples_left = int((4. / epsilon) ** d)
    batch_limit = 32000

    best_x = np.zeros((p, 1))
    best_val = -np.inf

    # Vd lives on the device for the whole run.
    d_Vd = cuda.to_device(Vd)
    stream = cuda.stream()
    rng = curand.PRNG(stream=stream)

    while samples_left != 0:
        batch = min(samples_left, batch_limit)
        samples_left -= batch

        # Per-batch device buffers: projections, selected indices, norms
        # and the random samples themselves.
        d_proj = cuda.device_array(shape=(Vd.shape[0], batch), order='F')
        d_idx = cuda.device_array(shape=(k, batch), dtype=np.int16,
                                  order='F')
        d_norms = cuda.device_array(shape=batch, dtype=np.float64)
        d_rand = cuda.device_array(shape=(d, batch), order='F')

        # Draw every random sample of the batch up front and normalise
        # them on the device.
        rng.normal(d_rand.reshape(d_rand.size), mean=0, sigma=1)
        norm_random_nums[calc_ncta1d(d_rand.shape[1], 512), 512, stream](
            d_rand, d)

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel scales easily because the number of samples is
        # used as the number of blocks.
        batch_matmul[batch, 512, stream](d_Vd, d_rand, d_proj)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection dominates the run time.
        batch_k_selection[batch, Vd.shape[0], stream](d_proj, d_idx, k)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        batch_scatter_norm[calc_ncta1d(batch, 512), 512, stream](
            d_proj, d_idx, d_norms)

        norms = d_norms.copy_to_host(stream=stream)
        stream.synchronize()

        for col, val in enumerate(norms):
            if not val > best_val:
                continue
            best_val = val
            best_x.fill(0)
            # Only copy the winning column back from the device.
            column = gpu_slice(d_proj, col).reshape(p, 1)
            picked = gpu_slice(d_idx, col).reshape(k, 1)
            best_x[picked] = (column[picked] / val)

        # Free allocations
        del d_proj, d_idx, d_norms, d_rand

    return best_x
def fista(I, Phi, lambdav, L=None, tol=10e-6, max_iterations=200,
          display=True, verbose=False):
    """
    Run FISTA on the GPU to minimise, for every column of I,
        lambdav * sum(|x|) + sum((I - Phi.dot(x)) ** 2)

    I: Images, array of shape (m, batch)
    Phi: Dictionary, array of shape (m, n); a scipy.sparse matrix is
         converted to a dense array
    lambdav: Sparse Penalty
    L: Largest singular value of Phi (computed with svds when None). It is
       squared and doubled to obtain the step-size constant of
       2 * Phi.T * Phi.
    tol: Stop once the relative change of the objective between two
         iterations drops below this value
    max_iterations: Upper bound on the number of iterations
    display: Print a summary (iterations, objective, delta) at the end
    verbose: Print the objective after every iteration

    Returns the coefficients as a host array of shape (n, batch), float32.
    Raises ValueError if I and Phi do not have the same number of rows.
    """
    b = numbapro.cudalib.cublas.Blas()
    (m, n) = Phi.shape
    (m_images, batch) = I.shape
    if m_images != m:
        raise ValueError("I has %d rows but Phi has %d" % (m_images, m))

    if L is None:
        L = scipy.sparse.linalg.svds(Phi, 1, which='LM',
                                     return_singular_vectors=False)
        print("Max eigenvalue: ." + str(L))
    # svds returns a one-element array and callers may pass an int; use a
    # plain float so 1 / L can never be an integer division and the
    # kernels/BLAS calls receive real scalars.
    # NOTE(review): the transform below is applied to a caller-supplied L as
    # well -- confirm this matches the original layout.
    L = float(np.ravel(L)[0])
    L = (L ** 2) * 2  # L = svd(Phi) -> eig(2*(Phi.T*Phi))
    invL = 1.0 / L
    t = 1.

    if sps.issparse(Phi):
        Phi = np.array(Phi.todense())

    d_I = cuda.to_device(np.array(I, dtype=np.float32, order='F'))
    d_Phi = cuda.to_device(np.array(Phi, dtype=np.float32, order='F'))
    d_Q = cuda.device_array((n, n), dtype=np.float32, order='F')
    d_c = cuda.device_array((n, batch), dtype=np.float32, order='F')
    d_x = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
    d_y = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))
    d_x2 = cuda.to_device(
        np.array(np.zeros((n, batch), dtype=np.float32), order='F'))

    # Temporary array variables
    d_t = cuda.device_array((m, batch), dtype=np.float32, order='F')
    d_t2 = cuda.device_array(n * batch, dtype=np.float32, order='F')

    b.gemm('T', 'N', n, n, m, 1, d_Phi, d_Phi, 0, d_Q)  # Q = Phi^T * Phi
    b.gemm('T', 'N', n, batch, m, -2, d_Phi, d_I, 0, d_c)  # c = -2*Phi^T * y

    # Integer ceil-division: math.ceil(a / b) on two ints truncates before
    # the ceil under Python 2 and leaves the ragged edge (or everything,
    # when the size is below the block size) without threads.
    blockdim = 32, 32
    griddim = ((n + blockdim[0] - 1) // blockdim[0],
               (batch + blockdim[1] - 1) // blockdim[1])
    blockdim_1d = 256
    griddim_1d = (n * batch + blockdim_1d - 1) // blockdim_1d

    start = l2l1obj(b, d_I, d_Phi, d_x, d_t, d_t2, lambdav, blockdim_1d,
                    griddim_1d)
    obj2 = start

    i = 0  # keeps the summary below valid when max_iterations is 0
    for i in range(max_iterations):
        # x2 = 2*Q*y + c
        b.symm('L', 'U', n, batch, 2, d_Q, d_y, 0, d_x2)
        b.geam('N', 'N', n, batch, 1, d_c, 1, d_x2, d_x2)

        # x2 = y - invL * x2
        b.geam('N', 'N', n, batch, 1, d_y, -invL, d_x2, d_x2)

        # proxOp()
        l1prox[griddim, blockdim](d_x2, invL * lambdav, d_x2)
        t2 = (1 + math.sqrt(1 + 4 * (t ** 2))) / 2.0

        # y = x2 + ((t-1)/t2)*(x2-x)
        b.geam('N', 'N', n, batch, 1 + (t - 1) / t2, d_x2, (1 - t) / t2,
               d_x, d_y)

        # x = x2
        b.geam('N', 'N', n, batch, 1, d_x2, 0, d_x, d_x)
        t = t2

        # update objective
        obj = obj2
        obj2 = l2l1obj(b, d_I, d_Phi, d_x2, d_t, d_t2, lambdav, blockdim_1d,
                       griddim_1d)

        if verbose:
            print("L1 Objective: " + str(obj2))

        # A previous objective of exactly zero cannot be improved and would
        # make the relative change a division by zero.
        if obj == 0 or np.abs(obj - obj2) / float(obj) < tol:
            break

    x2 = d_x2.copy_to_host()

    if display:
        print("FISTA Iterations: " + str(i))
        print("L1 Objective: " + str(lambdav * np.sum(np.abs(x2)) +
                                     np.sum((I - Phi.dot(x2)) ** 2)))
        print("Objective delta: " + str(obj2 - start))

    return x2
def test_sort():
    """Time one RadixGPU launch against list.sort() on the same random data.

    Fills NUM_ELEMENTS uint32 values with randint(0, 100), runs the RadixGPU
    kernel once (one block per 2 * BLOCK_SIZE elements), copies the output
    back, sorts a host copy of the input with list.sort(), and prints both
    elapsed times followed by the element count. The GPU output is not
    compared with the CPU result.
    """
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    for slot in range(NUM_ELEMENTS):
        in_h[slot] = randint(0, 100)
    print(in_h)

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    # GPU timing window: launch configuration, kernel launch and the
    # copy of the result back to the host.
    tkg1 = time()
    block_shape = (BLOCK_SIZE, 1)
    elements_per_block = 2 * 1.0 * block_shape[0]
    grid_shape = (int(ceil(NUM_ELEMENTS / elements_per_block)), 1)
    RadixGPU[grid_shape, block_shape](in_d, out_d, NUM_ELEMENTS)
    out_d.copy_to_host(out_h)
    # A bitonic merge stage that used to follow the radix launch is
    # disabled; it existed here only as commented-out code.
    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()

    # CPU reference timing on a plain Python list of the same values.
    in_cpu = list(in_h)
    tc1 = time()
    in_cpu.sort()
    tc2 = time()

    # Joined with a single space to reproduce the output of the original
    # comma-separated print statements.
    print(" ".join(["GPU Time = ", str(tkg2 - tkg1)]))
    print(" ".join(["CPU Time = ", str(tc2 - tc1)]))
    print(len(in_cpu))
def dot(self, a, b, out=None):
    """Dot product of two 2D arrays or 1D vectors on the device.

    Checks array type and shape. Should behave like numpy.dot(a, b).

    Parameters
    ----------
    a : array-like
        Numpy or DeviceNDArray
    b : array-like
        Numpy or DeviceNDArray; must be float32.
    out : DeviceNDArray (optional)
        Filled with the result if given. It is not used for the
        vector-vector case, where the value returned by blas.dot is
        returned instead.

    Raises
    ------
    NotImplementedError
        If out is neither None nor a DeviceNDArray, b is not float32, or
        the combination of dimensions is unsupported.
    ValueError
        If the operand or output shapes do not line up.
    """
    b, out_dtype = _check_array(b)
    a, out_dtype = _check_array(a)

    if not (isinstance(out, cuda.cudadrv.devicearray.DeviceNDArray)
            or out is None):
        raise NotImplementedError
    if b.dtype != np.float32:
        raise NotImplementedError

    a_dim = a.shape
    b_dim = b.shape

    # matrix . matrix
    if a.ndim == 2 and b.ndim == 2:
        if a_dim[1] != b_dim[0]:
            raise ValueError('matrices are not aligned')
        rows, inner, cols = a_dim[0], a_dim[1], b_dim[1]
        if out is None:
            out = cuda.device_array((rows, cols), dtype=out_dtype, order='F')
        elif out.shape[0] != rows or out.shape[1] != cols:
            raise ValueError('matrices are not aligned')
        self.blas.gemm('N', 'N', rows, cols, inner, 1., a, b, 0., out)
        return out

    # matrix . vector
    if a.ndim == 2 and b.ndim == 1:
        if a_dim[1] != b_dim[0]:
            raise ValueError('matrices are not aligned')
        if out is None:
            out = cuda.device_array(a_dim[0], dtype=out_dtype, order='F')
        elif out.shape[0] != a_dim[0]:
            raise ValueError('matrices are not aligned')
        self.blas.gemv('N', a_dim[0], a_dim[1], 1., a, b, 0., out)
        return out

    # vector . matrix, computed as transpose(b) . a
    if a.ndim == 1 and b.ndim == 2:
        if a_dim[0] != b_dim[0]:
            raise ValueError('matrices are not aligned')
        if out is None:
            out = cuda.device_array(b_dim[1], dtype=out_dtype, order='F')
        elif out.shape[0] != b_dim[1]:
            raise ValueError('matrices are not aligned')
        self.blas.gemv('T', b_dim[0], b_dim[1], 1., b, a, 0., out)
        return out

    # vector . vector
    if a.ndim == 1 and b.ndim == 1:
        if a_dim[0] != b_dim[0]:
            raise ValueError('matricies not aligned')
        return self.blas.dot(a, b)

    raise NotImplementedError