def zoom_numbaThread(data, chunkIndices, zoomArray):
    """
    Bilinearly resample ``data`` onto the grid of ``zoomArray`` for a chunk of rows.

    Written as plain nested loops so it stays fast when compiled with numba.
    No zoom factor is passed in: it follows from the ratio of the two array
    shapes. Only output rows ``chunkIndices[0] <= i < chunkIndices[1]`` are
    written; every column of those rows is filled.

    Parameters:
        data (ndarray): The 2-D array to zoom
        chunkIndices (sequence): (start index, stop index) of the output rows to compute
        zoomArray (ndarray): The 2-D array that receives the result

    Returns:
        ndarray: ``zoomArray``, the same object that was passed in
    """
    # The denominators sit a hair above (size - 1), so the largest source
    # coordinate stays just below the last index and the "+ 1" neighbour
    # used by the interpolation always exists.
    rowSpan = numba.float32(data.shape[0] - 1)
    rowDenom = zoomArray.shape[0] - 0.99999999
    colSpan = numba.float32(data.shape[1] - 1)
    colDenom = zoomArray.shape[1] - 0.99999999
    nColsOut = zoomArray.shape[1]

    for row in range(chunkIndices[0], chunkIndices[1]):
        srcRow = row * rowSpan / rowDenom
        r0 = numba.int32(srcRow)
        rowFrac = srcRow - r0

        for col in range(nColsOut):
            srcCol = col * colSpan / colDenom
            c0 = numba.int32(srcCol)

            # Interpolate along the first axis at the two bracketing columns...
            atC0 = data[r0, c0] + (data[r0 + 1, c0] - data[r0, c0]) * rowFrac
            atC1 = data[r0, c0 + 1] + (data[r0 + 1, c0 + 1] - data[r0, c0 + 1]) * rowFrac

            # ...then along the second axis between those two values.
            zoomArray[row, col] = atC0 + (atC1 - atC0) * (srcCol - c0)

    return zoomArray
def bilinear_interp_numba_inbounds(data, xCoords, yCoords, chunkIndices, interpArray):
    """
    Bilinear interpolation of ``data`` on a rectilinear grid, for a chunk of rows.

    Plain nested loops, intended to be compiled with numba. Only output rows
    ``chunkIndices[0] <= i < chunkIndices[1]`` are written, which lets several
    threads share one output array. No bounds checking is done: for every
    coordinate ``c`` the indices ``int(c)`` and ``int(c) + 1`` are both read
    from ``data``.

    Parameters:
        data (ndarray): The 2-D array to interpolate
        xCoords (ndarray): 1-D array of coordinates along the first axis of ``data``
        yCoords (ndarray): coordinates along the second axis of ``data``;
            read here with a single index, i.e. treated as 1-D
        chunkIndices (ndarray): 2 element array, (start index, stop index) into ``xCoords``
        interpArray (ndarray): The 2-D array that receives the result

    Returns:
        ndarray: ``interpArray``, the same object that was passed in
    """
    nCols = yCoords.shape[0]

    for row in range(chunkIndices[0], chunkIndices[1]):
        xPos = xCoords[row]
        r0 = numba.int32(xPos)
        rowFrac = xPos - r0

        for col in range(nCols):
            yPos = yCoords[col]
            c0 = numba.int32(yPos)

            # Interpolate along the first axis at the two bracketing columns...
            atC0 = data[r0, c0] + (data[r0 + 1, c0] - data[r0, c0]) * rowFrac
            atC1 = data[r0, c0 + 1] + (data[r0 + 1, c0 + 1] - data[r0, c0 + 1]) * rowFrac

            # ...then along the second axis between those two values.
            interpArray[row, col] = atC0 + (atC1 - atC0) * (yPos - c0)

    return interpArray
def cu_mat_power(A, power, power_A):
    """Write ``A[r, c] ** int32(power)`` into ``power_A[r, c]`` for this thread's element.

    The element handled is taken from ``cuda.grid(2)``; threads whose position
    falls outside ``power_A.shape`` do nothing.

    Args:
        A: 2-D input array, indexed like ``power_A``.
        power: exponent; converted with ``int32`` before use.
        power_A: 2-D output array.
    """
    row, col = cuda.grid(2)
    n_rows, n_cols = power_A.shape
    # Guard against the launch grid overhanging the array.
    if col < n_cols and row < n_rows:
        power_A[row, col] = math.pow(A[row, col], int32(power))
def geometric_propagation_numba(phase_screens, metapupil_coords, output_phase, thread_indices):
    """
    Add the bilinearly interpolated phase of every layer into ``output_phase``.

    Plain nested loops, intended to be compiled with numba. For each layer the
    screen is sampled at the layer's coordinates and the result is accumulated
    (``+=``) into ``output_phase``. Only output rows
    ``thread_indices[0] <= i < thread_indices[1]`` are touched, so several
    threads can share one output array.

    Parameters:
        phase_screens (ndarray): 3-D array indexed ``[layer, x, y]``
        metapupil_coords (ndarray): 3-D array; ``[layer, 0, i]`` holds the
            x-coordinates and ``[layer, 1, j]`` the y-coordinates for a layer.
            Modified in place: a final coordinate that lies exactly on the
            last screen index is reduced by 1e-6.
        output_phase (ndarray): 2-D array indexed ``[i, j]`` that the
            interpolated phase is added to
        thread_indices (ndarray): 2 element array, (start index, stop index)
            of the x-coordinates to work on

    Returns:
        ndarray: ``output_phase``, the same object that was passed in
    """
    # The y loop must cover the coordinate axis (axis 2), not the layer axis.
    n_y = metapupil_coords.shape[2]

    for layer in range(phase_screens.shape[0]):
        # The interpolation reads index + 1, so a coordinate sitting exactly
        # on the last index would read past the edge; pull it just inside.
        if metapupil_coords[layer, 0, -1] == phase_screens.shape[1] - 1:
            metapupil_coords[layer, 0, -1] -= 1e-6

        if metapupil_coords[layer, 1, -1] == phase_screens.shape[2] - 1:
            metapupil_coords[layer, 1, -1] -= 1e-6

        for i in range(thread_indices[0], thread_indices[1]):
            x = metapupil_coords[layer, 0, i]
            x1 = numba.int32(x)

            for j in range(n_y):
                y = metapupil_coords[layer, 1, j]
                y1 = numba.int32(y)

                # Interpolate along x at the two bracketing y indices...
                xGrad1 = phase_screens[layer, x1 + 1, y1] - phase_screens[layer, x1, y1]
                a1 = phase_screens[layer, x1, y1] + xGrad1 * (x - x1)

                xGrad2 = phase_screens[layer, x1 + 1, y1 + 1] - phase_screens[layer, x1, y1 + 1]
                a2 = phase_screens[layer, x1, y1 + 1] + xGrad2 * (x - x1)

                # ...then along y, accumulating this layer's contribution.
                yGrad = a2 - a1
                output_phase[i, j] += a1 + yGrad * (y - y1)

    return output_phase
def test_4(self):
    """Run ``func3`` through ``_run_and_compare`` for float64, float32, int32 and uint32 inputs.

    The same 0..99 array is converted from one dtype to the next and passed
    as both arguments each time.
    """
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    func = self.funcs['func3']

    A = np.arange(100, dtype=np.float64)
    self._run_and_compare(func, sig, A, A)

    # Each conversion starts from the previous array, as a chain.
    for dtype in (np.float32, np.int32, np.uint32):
        A = A.astype(dtype)
        self._run_and_compare(func, sig, A, A)
def _test_template_4(self, target):
    """Check that ``vector_add`` vectorized for ``target`` agrees with ``np.add``.

    The comparison is repeated for double, float32, int32 and uint32 inputs
    and uses ``np.testing.assert_allclose``.
    """
    signatures = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    compiled_add = vectorize(signatures, target=target)(vector_add)
    reference_add = np.add

    def check(dtype):
        sample = np.linspace(0., 100., 500).astype(dtype)
        actual = compiled_add(sample, sample)
        expected = reference_add(sample, sample)
        np.testing.assert_allclose(expected, actual)

    for dtype in (np.double, np.float32, np.int32, np.uint32):
        check(dtype)
def sentiment_feature(basic_train,basic_test):
    """
    Add a lexicon-based sentiment score to the train and test frames.

    The lexicon is read from the tab-separated file ``sentiment_word.csv`` in
    the current directory. Of its columns only the word ('词语'), the
    intensity ('强度') and the polarity ('极性') are kept. For every text in
    the 'clean&segment' column the space-separated words are looked up in the
    lexicon and the signed intensities of the matches are summed:
    polarity '1' counts positive, '0' and '3' count as zero, anything else
    counts negative. Texts with no match score 0.

    Parameters:
        basic_train: frame with at least the columns 'clean&segment' and 'pid'
        basic_test: frame with at least the columns 'clean&segment' and 'pid'

    Returns:
        (train, test): copies indexed by 'pid' with a single 'sentiment' column
    """
    # NOTE(review): both numba decorators wrap functions that operate on
    # pandas rows / Python strings, and the declared signatures do not match
    # the functions' arity -- confirm they compile with the numba version in
    # use. They are left untouched here.
    @guvectorize([(int64[:] )], '(n),()->(n)')
    def compute_scores(x):
        if x['极性'] =='0' or x['极性'] =='3':
            return 0
        flag =(1 if x['极性']=='1' else -1)
        return flag * int(x['强度'])

    @vectorize([int32()])
    def compute_sentiment_scores(line):
        words = np.array(line.split(' '))
        mask = sentiment['词语'].isin(words)
        if mask.any() == False :
            return 0
        else:
            return sum(sentiment[mask].apply(compute_scores,axis=1))

    with open('sentiment_word.csv') as file:
        result = [line.strip().split('\t') for line in file.readlines()]

    # Drop malformed rows with more than 10 fields. This is done by building
    # a new list: removing items from a list while iterating over it skips
    # the element that follows each removal.
    result = [line for line in result if len(line) <= 10]

    sentiment = pd.DataFrame(columns=result[0],data=result[1:])
    # NOTE(review): ``.decode`` presumably targets Python 2 byte strings;
    # verify before running under Python 3.
    sentiment['词语']=sentiment['词语'].map(lambda x:x.decode("utf-8"))

    # Renumber the columns so the unused trailing ones can be dropped by
    # position, then give the remaining ones their names back.
    sentiment.columns = range(10)
    sentiment.drop([7,8,9],axis=1,inplace=True)
    sentiment.columns = ['词语','词性','词义数','词义序号','分类','强度','极性']
    sentiment.drop(['词性','词义数','词义序号','分类'],axis=1,inplace=True)

    train = basic_train[['clean&segment','pid']].copy()
    test = basic_test[['clean&segment','pid']].copy()

    train['sentiment'] = train['clean&segment'].map(compute_sentiment_scores)
    test['sentiment'] = test['clean&segment'].map(compute_sentiment_scores)

    train.drop('clean&segment',axis=1,inplace=True)
    test.drop('clean&segment',axis=1,inplace=True)

    train.set_index('pid',inplace=True)
    test.set_index('pid',inplace=True)

    return train,test
def template_vectorize(self, target):
    """Assert that ``vector_add`` vectorized for ``target`` agrees with ``np.add``.

    The comparison is repeated for double, float32, int32 and uint32 inputs
    and uses ``np.allclose``.
    """
    # Native-code ufunc under test.
    signatures = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    compiled_add = vectorize(signatures, target=target)(vector_add)

    # NumPy ufunc used as the reference.
    reference_add = np.add

    def check(dtype):
        sample = np.linspace(0., 100., 500).astype(dtype)
        actual = compiled_add(sample, sample)
        expected = reference_add(sample, sample)
        self.assertTrue(np.allclose(expected, actual))

    for dtype in (np.double, np.float32, np.int32, np.uint32):
        check(dtype)
return wij, B, proportions @njit def calc_mult(w, p): n = w.shape[0] wij=np.zeros((n,n), dtype=np.int32) for i in range(n): if w[i]!=0: wij[i,:]=np.random.multinomial(w[i], p[i]) else: wij[i,:]=np.zeros(n) return wij #from numba import vectorize, int64, float64 @vectorize([int32(int32, int32)]) def vec_randunif(l, h): return np.random.uniform(l,h) @njit #@jit(locals={'new_observation': numba.types.int32[:]}, nopython=True) def calc(observation, action, price, action_L, high, p, distance_ij, lost_sales_cost ): epsilons = vec_randunif(action_L, high) demand = action + epsilons w = np.minimum(demand, observation) wij = calc_mult(w, p) num_lost_sales = demand - w dwij=np.multiply(distance_ij, wij)
xc -= xc_floor for i in range(yc_floor.shape[0]): for j in range(yc_floor.shape[1]): yf = min(Ly - 1, max(0, yc_floor[i, j])) xf = min(Lx - 1, max(0, xc_floor[i, j])) yf1 = min(Ly - 1, yf + 1) xf1 = min(Lx - 1, xf + 1) y = yc[i, j] x = xc[i, j] Y[i, j] = (np.float32(I[yf, xf]) * (1 - y) * (1 - x) + np.float32(I[yf, xf1]) * (1 - y) * x + np.float32(I[yf1, xf]) * y * (1 - x) + np.float32(I[yf1, xf1]) * y * x) @vectorize([int32(float32)], nopython=True) def nfloor(y): return math.floor(y) #np.int32(np.floor(y)) @njit([ 'int16[:, :,:], float32[:,:,:], float32[:,:,:], float32[:,:], float32[:,:], float32[:,:,:]', 'float32[:, :,:], float32[:,:,:], float32[:,:,:], float32[:,:], float32[:,:], float32[:,:,:]' ], parallel=True) def shift_coordinates(data, yup, xup, mshy, mshx, Y): """ shift data into yup and xup coordinates Parameters -------------
if thresholds is None: thresholds = cfg.SZO.EVALUATION.THRESHOLDS assert 5 == prediction.ndim assert 5 == truth.ndim assert prediction.shape == truth.shape assert prediction.shape[2] == 1 thresholds = [rainfall_to_pixel(thresholds[i]) for i in range(len(thresholds))] thresholds = sorted(thresholds) ret = _get_hit_miss_counts_numba(prediction=prediction, truth=truth, mask=mask, thresholds=thresholds) return ret[:, :, :, 0], ret[:, :, :, 1], ret[:, :, :, 2], ret[:, :, :, 3] @jit(int32(float32, float32, boolean, float32)) def _get_hit_miss_counts_numba(prediction, truth, mask, thresholds): seqlen, batch_size, _, height, width = prediction.shape threshold_num = len(thresholds) ret = np.zeros(shape=(seqlen, batch_size, threshold_num, 4), dtype=np.int32) for i in range(seqlen): for j in range(batch_size): for m in range(height): for n in range(width): if mask[i][j][0][m][n]: for k in range(threshold_num): bpred = prediction[i][j][0][m][n] >= thresholds[k] btruth = truth[i][j][0][m][n] >= thresholds[k] ind = (1 - btruth) * 2 + (1 - bpred) ret[i][j][k][ind] += 1
def hist(val, vmin, vptp, vres):
    """Map ``val`` to a bin index: ``int32(((val - vmin) / vptp) * vres)``.

    Args:
        val: value to place.
        vmin: value that maps to index 0.
        vptp: width of the value range (divisor).
        vres: multiplier applied to the normalised position.
    """
    position = (val - vmin) / vptp
    return int32(position * vres)
def decode(self):
    """Decode the image data and return it as a pixel array.

    Steps, as implemented below:
      1. concatenate the data of every 'IDAT' chunk read from the file,
      2. zlib-decompress the result,
      3. undo the per-scanline filtering (filter types 0-4),
      4. reinterpret the bytes as pixels and reshape to
         ``(height, width, pixel_width[color_type])``.

    Only bit depths 8 and 16 are handled; any other depth raises
    ``NotImplementedError``.
    """
    width = self.ihdr_info['width']
    height = self.ihdr_info['height']
    bit_depth = self.ihdr_info['bit depth']
    color_type = self.ihdr_info['color type']
    pixel_width = self.__pixel_width

    # Each inverse filter works in place on row ``m`` of ``buf`` (row ``m-1``
    # is the previous scanline), over ``N`` columns, with ``bpp`` the column
    # offset to the corresponding byte of the pixel to the left.
    def ifilter0(m, N, buf, bpp):
        '''Type 0: No filter'''
        return

    @numba.jit((int32, int32, int32[:, :], int32))
    def ifilter1(m, N, buf, bpp):
        '''Type 1: Inverse Sub filter'''
        for k in range(bpp, N):
            buf[m, k] += buf[m, k-bpp]
            buf[m, k] &= 0xff  # keep the value within one byte

    @numba.jit((int32, int32, int32[:, :], int32))
    def ifilter2(m, N, buf, bpp):
        '''Type 2: Inverse Up filter'''
        buf[m, :] += buf[(m-1), :]
        buf[m, :] &= 0xff

    @numba.jit((int32, int32, int32[:, :], int32))
    def ifilter3(m, N, buf, bpp):
        '''Type 3: Inverse Average filter'''
        # The first ``bpp`` columns have no left neighbour, so only the row
        # above contributes.
        for k in range(bpp):
            buf[m, k] += buf[(m-1), k] // 2
            buf[m, k] &= 0xff
        for k in range(bpp, N):
            buf[m, k] += (buf[m, (k-bpp)] + buf[(m-1), k]) // 2
            buf[m, k] &= 0xff

    @numba.jit(int32(int32, int32, int32))
    def predictor(a, b, c):
        '''Helper function for ifilter4.
        a = left, b = above, c = upper left.'''
        p = a + b -c
        pa = abs(p - a)
        pb = abs(p - b)
        pc = abs(p - c)
        # Return whichever neighbour is closest to p; ties favour a, then b.
        if pa <= pb and pa <= pc:
            return a
        elif pb <= pc:
            return b
        else:
            return c

    @numba.jit((int32, int32, int32[:, :], int32))
    def ifilter4(m, N, buf, bpp):
        '''Type 4: Inverse Paeth filter'''
        # The first ``bpp`` columns have no left / upper-left neighbour.
        for k in range(bpp):
            buf[m, k] += buf[(m-1), k]
            buf[m, k] &= 0xff
        for k in range(bpp, N):
            buf[m, k] += predictor(buf[m, (k-bpp)], buf[(m-1), k], buf[(m-1), (k-bpp)])
            buf[m, k] &= 0xff

    def ifilter(byte_stream):
        '''Undo the scanline filtering of the decompressed data.

        byte_stream: decompressed data stream; each of the ``height`` rows
            is one filter-type byte followed by ``bwidth`` data bytes
        return value: (height, bwidth) ubyte array of unfiltered bytes

        ``width`` and ``height`` are taken from the enclosing scope.'''
        # Number of data bytes per scanline.
        bwidth = int(math.ceil(width * bit_depth * pixel_width[color_type] / 8.0))
        bpp = self.__bpp
        filter_list = [ifilter0, ifilter1, ifilter2, ifilter3, ifilter4]
        # Row 0 is an all-zero "previous scanline" for the first real row;
        # column 0 holds each row's filter-type byte.
        # NOTE(review): ``np.int`` and binary ``np.fromstring`` are no longer
        # available / are deprecated in recent NumPy releases -- confirm the
        # NumPy version this is run with.
        buf = np.empty((height+1, bwidth+1), dtype=np.int)
        buf[0, :] = 0
        buf[1:, :] = np.reshape(np.fromstring(byte_stream, dtype=np.ubyte), (height, bwidth+1))
        for m in range(1, height+1):
            filter_type = buf[m, 0]
            if filter_type == 0:
                continue
            # The filters see the buffer without the filter-type column.
            filter_list[filter_type](m, bwidth, buf[:, 1:], bpp)
        byte_mtx = np.empty((height, bwidth), dtype=np.ubyte)
        byte_mtx[:, :] = buf[1:, 1:]
        return byte_mtx

    # Only referenced by the commented-out code in bytes_to_pixels below.
    def split_byte(b, width):
        mask = 2**width - 1
        li = []
        for k in range(8//width):
            li.append(b & mask)
            b >>= width
        li.reverse()
        return li

    def bytes_to_pixels(mtx, bit_depth, img_width):
        # if bit_depth < 8:
        #     for idx, line in enumerate(mtx):
        #         pixels = []
        #         for B in line:
        #             pixels.extend(split_byte(B, bit_depth))
        #         pixels = pixels[:img_width]
        #         mtx[idx] = pixels
        # if bit_depth == 16:
        #     for idx, line in enumerate(mtx):
        #         pixels = []
        #         for k in range(img_width):
        #             pixels.append(line[2*k]*2**8 + line[2*k+1])
        #         mtx[idx] = pixels
        if bit_depth == 8:
            return mtx
        if bit_depth == 16:
            # NOTE(review): the commented-out code above combines the two
            # bytes high-byte-first, whereas ``view`` uses the machine's
            # native byte order -- confirm this is intended.
            return mtx.view(np.ushort) # reinterpret_cast
        else:
            raise NotImplementedError

    # Gather the compressed payload of all IDAT chunks, in chunk order.
    # NOTE(review): a text ``StringIO`` receives bytes read in 'rb' mode;
    # presumably this targets Python 2 -- verify.
    com_stream = StringIO()
    with open(self.__filename, 'rb') as f:
        for chunk in self.__chunks:
            if chunk['type'] == 'IDAT':
                f.seek(chunk['data pos'])
                com_stream.write(f.read(chunk['len']))
    byte_stream = zlib.decompress(com_stream.getvalue())
    pix_mtx = ifilter(byte_stream)
    pix_mtx = bytes_to_pixels(pix_mtx, bit_depth, width)
    pixel_type = np.ubyte if bit_depth <=8 else np.ushort
    pix_mtx = np.array(pix_mtx, dtype=pixel_type)
    pix_mtx.shape = (height, width, pixel_width[color_type])
    return pix_mtx
r = linspace(xmin, xmax, width) i = linspace(ymin, ymax, height) n = [[0] * width for _ in range(height)] for x in range(width): for y in range(height): n[y][x] = mandel_numba(complex(r[x], i[y]), maxiter) return n ############################################################################## #Numba Vectorize @vectorize([int32(complex64, int32)], target='parallel') def mandel_numba_vect(c, maxiter): nreal = 0 real = 0 imag = 0 for n in range(maxiter): nreal = real * real - imag * imag + c.real imag = 2 * real * imag + c.imag real = nreal if real * real + imag * imag > 4.0: #squared modulus return n return n def mandel_set_numba_vect(xmin, xmax, ymin, ymax, width, height, maxiter): r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
Args: x1 (array): First component of vector 1 y1 (array): Second component of vector 1 z1 (array): Third component of vector 1 x2 (array): First component of vector 2 y2 (array): Second component of vector 2 z2 (array): Third component of vector 2 Returns: r2 (array): Element-wise squared distance (see definition) .. math:: r2 = (x1 - x2)^{2} + (y1 - y2)^{2} + (z1 - z2)^{2} ''' return (x1 - x2)**2 + (y1 - y2)**2 + (z1 - z2)**2 if global_config['pkg_numba']: from numba import vectorize, float64, float32, int64, int32 vmag3 = vectorize([int32(int32, int32, int32), int64(int64, int64, int64), float32(float32, float32, float32), float64(float64, float64, float64)])(vmag3) vdist3 = vectorize([int32(int32, int32, int32, int32, int32, int32), int64(int64, int64, int64, int64, int64, int64), float32(float32, float32, float32, float32, float32, float32), float64(float64, float64, float64, float64, float64, float64)])(vdist3)
from numba import njit from numba import int32, float32, prange from numba.core import types from numba import typeof from numba.typed import List, Dict from numba.core.errors import TypingError from numba.tests.support import (TestCase, MemoryLeakMixin, override_config, forbid_codegen, skip_parfors_unsupported) from numba.core.unsafe.refcount import get_refcount from numba.experimental import jitclass # global typed-list for testing purposes global_typed_list = List.empty_list(int32) for i in (1, 2, 3): global_typed_list.append(int32(i)) def to_tl(l): """ Convert cpython list to typed-list. """ tl = List.empty_list(int32) for k in l: tl.append(k) return tl class TestTypedList(MemoryLeakMixin, TestCase): def test_basic(self): l = List.empty_list(int32) # len self.assertEqual(len(l), 0)
import numpy as np from numba import vectorize from numba import cuda, int32, float32, float64 from numba.cuda.testing import skip_on_cudasim from numba.cuda.testing import CUDATestCase from numba.core import config import unittest sig = [ int32(int32, int32), float32(float32, float32), float64(float64, float64) ] target = "cuda" if config.ENABLE_CUDASIM: target = "cpu" test_dtypes = np.float32, np.int32 @skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAVectorize(CUDATestCase): N = 1000001 def test_scalar(self): @vectorize(sig, target=target) def vector_add(a, b): return a + b
eq_t[h] += 1.0 / nb_best_hand # impossible : error else: return -1 # normalize eq_w_agg and eq_t_agg for h in xrange(p): eq_agg[h, 0] = eq_w[h] / n eq_agg[h, 1] = eq_t[h] / n return eq_agg rank_fast = jit( int32(int32[:], int32[:], uint32[:], int32[:], int32, int32, int32[:], int32[:], int32[:]))(rank) exhaustive_block_fast = jit(int32[:](int32[:, :], int32[:], int32[:], uint32[:], int32[:], int32, int32, int32[:], int32[:], int32[:]))(exhaustive_block) def exhaustive_eval(player_card, table_card): """compute all possible games given the player/table cards (as a numbers from 0 to 51) and return equity win/tie for each player""" p = player_card.shape[0] equity_arr = np.zeros([p, 2], dtype=np.float32) print '\n---------------- Exhaustive eval start' print 'player_card=\n{}'.format(player_card) print 'table_card=\n{}'.format(table_card)
neighbours[6, 0] = i + 1 neighbours[7, 0] = i + 1 neighbours[0, 1] = j + 1 neighbours[1, 1] = j - 1 neighbours[2, 1] = j neighbours[3, 1] = j neighbours[4, 1] = j + 1 neighbours[5, 1] = j - 1 neighbours[6, 1] = j + 1 neighbours[7, 1] = j - 1 neighbours %= grid_length @jit(int32(int8[:, :], int32[:, :]), nopython=True) def count_neighbours(in_grid, neighbours): """Count the number of live neighbours of the site.""" count = np.int32(0) for n_count in range(neighbours.shape[0]): if in_grid[neighbours[n_count, 0], neighbours[n_count, 1]] == 1: count += 1 return count @jit((int32[:, :], int8[:, :], int8[:, :]), nopython=True) def grid_sweep(neighbours, in_grid, out_grid): """Sweep the grid once with game of life rules.""" for i in range(in_grid.shape[0]): for j in range(in_grid.shape[1]):
j += 1 t[j] = t0 # Update t0, y0, z0 t0, y0, z0 = t1, y1, z1 # end # Update y if last y0 is greater than (or equal) threshold if cmp2(h, abs(y0 - y[t[j]])): j += 1 t[j] = t0 return j + 1 return findrfc2 @jit(int32(float64, float64), nopython=True) def a_le_b(a, b): return a <= b @jit(int32(float64, float64), nopython=True) def a_lt_b(a, b): return a < b _findrfc_le = _make_findrfc(a_le_b, a_lt_b) _findrfc_lt = _make_findrfc(a_lt_b, a_le_b) @jit(int64(int64[:], float64[:], float64), nopython=True) def _findrfc(ind, y, h):
def cuda_ij_to_k(i, j):
    """Return the flat index ``int32(j + i * (i - 1) / 2)`` for the pair ``(i, j)``.

    Args:
        i: first index; contributes the offset ``i * (i - 1) / 2``.
        j: second index; added to that offset.
    """
    row_offset = i * (i - 1) / 2
    return int32(j + row_offset)
def argcast(a, b):
    """Call ``argcast_inner`` with ``a`` converted via ``int32`` and ``b`` unchanged."""
    a_as_int32 = int32(a)
    return argcast_inner(a_as_int32, b)
@jit(int64(int32, int32)) def EncodeMorton2D(x, y): """ Calculates the 2D morton code from the x, y dimensions Args: x (int): the x dimension y (int): the y dimension Returns: int: 64 bit morton code in 2D """ return Expand2D(x) + (Expand2D(y) << 1) @jit(int32(int64)) def Compact2D(m): """ Decodes the 64 bit morton code into a 32 bit number in the 2D space using a divide and conquer approach for separating the bits. 1 bit is not used because the integers are not unsigned Args: n (int): a 64 bit morton code Returns: int: a dimension in 2D space Raises: Exception: ERROR: Morton code is always positive """
ret += d**2 return ret @cuda.jit(nb_float(nb_float[:], nb_float[:], nb_float[:]), device=True) def cu_pbc_dist(a, b, box): ret = 0 for i in range(a.shape[0]): d = a[i] - b[i] d -= box[i] * floor(d / box[i] + 0.5) ret += d**2 return sqrt(ret) return cu_pbc_dist2, cu_pbc_dist_diameter, cu_pbc_dist @cuda.jit(int32(int32[:], int32[:]), device=True) def cu_ravel_index_f_pbc(i, dim): # ravel index in Fortran way. ret = (i[0] + dim[0]) % dim[0] tmp = dim[0] for k in range(1, dim.shape[0]): ret += ((i[k] + dim[k]) % dim[k]) * tmp tmp *= dim[k] return ret @cuda.jit(void(int32, int32[:], int32[:]), device=True) def cu_unravel_index_f(i, dim, ret): # unravel index in Fortran way. for k in range(dim.shape[0]): ret[k] = int(i % dim[k]) i = (i - ret[k]) / dim[k]
def foo():
    """Create a list for ``types.unicode_type`` items and append an ``int32`` value to it."""
    text_list = listobject.new_list(types.unicode_type)
    mismatched_item = int32(0)
    text_list.append(mismatched_item)
def make_test_list():
    """Return a new ``int32`` list object containing the single value 1."""
    result = listobject.new_list(int32)
    result.append(int32(1))
    return result
xy + \\text{trunc}\\left(\\frac{\\left(\\left|x - y\\right| - 1\\right)^{2}}{4}\\right) Args: x (array): First value array y (array): Second value array Returns: p (array): Pairing function result Note: This function has a vectorized version that is imported as :func:`~exa.algorithms.indexing.unordered_pairing`; use that function when working with array data. .. _pairing function: http://www.mattdipasquale.com/blog/2014/03/09/unique-unordered-pairing-function/ ''' return np.int64(x * y + np.trunc((np.abs(x - y) - 1)**2 / 4)) if global_config['pkg_numba']: from numba import jit, vectorize, int32, int64, float32, float64 arange1 = jit(nopython=True, cache=True)(arange1) arange2 = jit(nopython=True, cache=True)(arange2) indexes_sc1 = jit(nopython=True, cache=True)(indexes_sc1) indexes_sc2 = jit(nopython=True, cache=True)(indexes_sc2) unordered_pairing = vectorize([int32(int32, int32), int64(int64, int64), float32(float32, float32), float64(float64, float64)], nopython=True)(unordered_pairing)
def foo():
    """Build the test list, mark it immutable, then attempt to append to it."""
    frozen_list = make_test_list()
    frozen_list._make_immutable()
    extra_item = int32(1)
    frozen_list.append(extra_item)
#! /usr/bin/python # -*- coding: utf-8 -*- # ising.py # Version: 2018.11.16.01 # Requiered libraries from __future__ import division import time, random, math, time, sys, os import matplotlib.pyplot as plt from numba import jit, prange, njit, int32 import numpy as np from numpy.random import random as nrand @njit(int32(int32, int32, int32)) # Periodic Boundary Condition def PBC(idx, lim, add): return (idx + lim + add) % lim # Monte Carlo Alg def MC(temperature, spins, MCc): # Temperature, Number of Spins, Number of cycles MSpins = np.ones((spins, spins), np.int32) # Spins Matrix pointing up # Initialize Statistics E = 0. Eavr = 0. Evar = 0. E2av = 0. Mavr = 0.
""" Core math functions to compute escape times for the Mandelbrot set. """ from numba import float64, int32, jit from numpy import empty @jit(int32(float64, float64, int32)) def mandelbrot_escape(x, y, n): """Mandelbrot set escape time algorithm for a given c = x + i*y coordinate. Returnautojit, the number of iterations necessary to escape abouve a fixed threshold (4.0) by repeatedly applying the formula: z_0 = 0 z_n = z_{n-1} ^ 2 + c If the formula did not escape after `n` iterations, return -1 . Parameters ---------- x, y -- float Real and imaginary part of the complex number z. n -- integer Maximum number of iterations. """ z_x = 0 z_y = 0 for i in range(n): old_z_x = z_x
from __future__ import print_function, absolute_import import numpy as np from numba import vectorize from numba import cuda, int32, float32, float64 from numba import unittest_support as unittest from numba.cuda.testing import skip_on_cudasim from numba.cuda.testing import CUDATestCase from numba import config sig = [int32(int32, int32), float32(float32, float32), float64(float64, float64)] target='cuda' if config.ENABLE_CUDASIM: target='cpu' test_dtypes = np.float32, np.int32 @skip_on_cudasim('ufunc API unsupported in the simulator') class TestCUDAVectorize(CUDATestCase): N = 1000001 def test_scalar(self): @vectorize(sig, target=target)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Fri Oct 25 20:50:52 2019 @author: daniel """ from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32, int64 import numpy as np @jit(int32(complex64, int32)) def mandelbrot(c, maxiter): nreal = 0 real = 0 imag = 0 for n in range(maxiter): nreal = real * real - imag * imag + c.real imag = 2 * real * imag + c.imag real = nreal if real * real + imag * imag > 4.0: return n return 0 @guvectorize([(complex64[:], int32[:], int32[:])], '(n),()->(n)', target='parallel') def mandelbrot_numpy(c, maxit, output): maxiter = maxit[0]
if current_interval == result_buffer_size: return current_interval # Add bounds to result buffer result_buffer[current_interval, 0] = current_interval_start result_buffer[current_interval, 1] = itv_end current_interval += 1 if current_interval == result_buffer_size: return current_interval n_intervals = current_interval # No +1, as current_interval was incremented also when the last interval closed return n_intervals @numba.jit(numba.int32(numba.float64[:], numba.float64, numba.int64[:, :]), nopython=True) def find_intervals_above_threshold_no_splitting(w, threshold, result_buffer): """Fills result_buffer with l, r bounds of intervals in w > threshold. Unlike find_intervals_above_threshold(), does not smooth and split hits, which allows speed increase in ZLE simulation. :param w: Waveform to do hitfinding in :param threshold: Threshold for including an interval :param result_buffer: numpy N*2 array of ints, will be filled by function. if more than N intervals are found, none past the first N will be processed. :returns : number of intervals processed Boundary indices are inclusive, i.e. the right boundary is the last index which was > threshold """ result_buffer_size = len(result_buffer) last_index_in_w = len(w) - 1
#!/usr/bin/env python
# coding: utf-8

import numpy as np
from numba import jit, guvectorize, complex128, int32


# burning ship set
@jit(int32(complex128, int32))
def burning_ship_iter(z, maxiter):
    """Return the escape iteration of the burning-ship map for the point ``z``.

    Starting from 0, the real and imaginary parts are updated as
    ``imag = |2*real*imag + z.imag|`` and ``real = |real^2 - imag^2 + z.real|``.

    Args:
        z: complex parameter of the map.
        maxiter: maximum number of iterations to try.

    Returns:
        The first iteration index ``n`` at which the squared modulus exceeds
        4.0, or 0 if that never happens within ``maxiter`` iterations.
    """
    real = 0
    imag = 0
    for n in range(maxiter):
        real2 = real * real
        imag2 = imag * imag
        # The escape test runs before the update, on the current value.
        if real2 + imag2 > 4.0:
            return n
        # ``imag`` must be updated first: it needs the old ``real``.
        imag = abs(2 * real * imag + z.imag)
        real = abs(real2 - imag2 + z.real)
    return 0


@guvectorize([(complex128[:], int32[:], int32[:])], '(n),()->(n)', target='parallel')
def burning_ship_numpy(z, maxit, output):
    """Fill ``output[i]`` with ``burning_ship_iter(z[i], maxit[0])`` for every ``i``.

    Args:
        z: 1-D array of complex points.
        maxit: array whose first element is the iteration limit.
        output: 1-D array, same length as ``z``, that receives the results.
    """
    maxiter = maxit[0]
    for i in range(z.shape[0]):
        output[i] = burning_ship_iter(z[i], maxiter)
from numba import vectorize, int32, complex128
from __init__ import plot, field

IMAX = 0xFFFF


@vectorize([int32(complex128)], target="parallel")
def mandelbrot_vector(c):
    """Count iterations of ``z -> z*z + c`` (from ``z = 0``) while ``abs(z) <= 2``.

    The count is capped at ``IMAX``; a return value of ``IMAX`` means the
    iteration never left the radius-2 disc.
    """
    z = 0
    count = 0
    while count < IMAX and abs(z) <= 2:
        z = z * z + c
        count += 1
    return count


def main():
    """Compute the iteration counts on ``field(1024)`` and plot them to a PNG."""
    counts = mandelbrot_vector(field(1024))
    # Points that hit the cap are shown with value 0.
    counts[counts == IMAX] = 0
    plot(counts, "numba_vectorize.png")


if __name__ == "__main__":
    main()
def cu_delay_cfun(horizon, cfpre, cfpost, n_cvar, n_thread_per_block, step_stride=0, aff_node_stride=0):
    """Build a CUDA device function that evaluates delayed coupling.

    Args:
        horizon: length of the delay buffer's step axis; must be a power of
            two and at least 2, since steps are wrapped with
            ``_cu_mod_pow_2(..., horizon)``.
        cfpre: device function applied to (post-node state, delayed pre-node
            value) before the weighted summation.
        cfpost: device function applied to each finished sum.
        n_cvar: first dimension of the shared-memory accumulator.
        n_thread_per_block: second dimension of the shared-memory accumulator.
        step_stride: multiplier applied to ``step`` when indexing ``state``
            and ``aff`` (0 except for testing).
        aff_node_stride: multiplier applied to ``i_post`` when indexing
            ``aff`` (0 except for testing).

    Returns:
        The device function ``dcfun(aff, delays, weights, state, i_post,
        i_thread, step, cvars, buf)``.

    Raises:
        ValueError: if ``horizon`` is below 2 or not a power of two.
    """
    horizon_ok = horizon >= 2 and (horizon & (horizon - 1)) == 0
    if not horizon_ok:
        raise ValueError(
            "cu_delay_cfun argument `horizon` should be a positive power of 2, but received %d"
            % horizon)

    # 0 except for testing
    step_stride = int32(step_stride)
    aff_node_stride = int32(aff_node_stride)

    @cuda.jit(device=True)
    def dcfun(aff, delays, weights, state, i_post, i_thread, step, cvars, buf):
        # Shared-memory scratch for the sums, indexed by block-local thread.
        sums = cuda.shared.array((n_cvar, n_thread_per_block), float32)
        lane = cuda.threadIdx.x
        n_cv = cvars.shape[0]

        # 0 except for testing
        state_step = step_stride * step

        # Store the current state of the post node in the delay buffer.
        for c in range(n_cv):
            buf[i_post, _cu_mod_pow_2(step, horizon), c, i_thread] = \
                state[state_step, i_post, cvars[c], i_thread]

        # Start every sum from zero.
        for c in range(n_cv):
            sums[c, lane] = float32(0.0)

        # Accumulate weight * cfpre(...) over the delayed values of every
        # pre node that has a non-zero connection weight.
        for i_pre in range(weights.shape[0]):
            w = weights[i_post, i_pre]
            if w != 0.0:
                slot = _cu_mod_pow_2(
                    step - delays[i_post, i_pre] + horizon, horizon)
                for c in range(n_cv):
                    delayed = buf[i_pre, slot, c, i_thread]
                    sums[c, lane] += \
                        w * cfpre(state[state_step, i_post, cvars[c], i_thread], delayed)

        # Apply cfpost and write out (lane index used here, i_thread above,
        # for tests).
        for c in range(n_cv):
            aff[state_step, i_post * aff_node_stride, c, lane] = cfpost(sums[c, lane])

    return dcfun
# # i = 0 # # while True: # mother = np.random.randint(0,len(fit)) # father = np.random.randint(0,len(fit)) # r_rec = recombine_arrays(population[:, mother], population[:, father]) # new_pop[:, i] = r_rec[:, 0] # new_pop[:, i+1] = r_rec[:, 1] # i += 2 # if i >= n: # break # # return new_pop @jit(int32(float32[:]),nopython=True) def weighted_choice(weights): i = 0 rnd = np.random.random() * np.sum(weights) for i in range(len(weights)): rnd -= weights[i] if rnd < 0: break return i @jit(float32[:,:](float32[:,:],float32[:]),nopython=True) def recombine_population(population,fitness): weights = fitness.max()-fitness new_pop = np.zeros(population.shape,dtype=np.float32) n = population.shape[1] fit = np.arange(0,int(n/20))
from __future__ import print_function, absolute_import import numpy as np from numba import vectorize from numba import cuda, int32, float32, float64 from timeit import default_timer as time from numba import unittest_support as unittest from numba.cuda.testing import skip_on_cudasim from numba.cuda.testing import CUDATestCase from numba import config sig = [int32(int32, int32), float32(float32, float32), float64(float64, float64)] target = "cuda" if config.ENABLE_CUDASIM: target = "cpu" test_dtypes = np.float32, np.int32 @skip_on_cudasim("ufunc API unsupported in the simulator") class TestCUDAVectorize(CUDATestCase): def test_scalar(self): @vectorize(sig, target=target) def vector_add(a, b): return a + b a = 1.2 b = 2.3 c = vector_add(a, b)
import numpy as np import numba @numba.vectorize([numba.int32(numba.uint8)], nopython=True) def hammard(n): """ fast uint8 Hammard Weight :param n: np.uint8 :return: np.uint8 """ # recursively divide in two, combinig sums by bit shifting and adding n = (n & np.uint8(85)) + ((n >> 1) & np.uint8(85)) # 85=01010101b n = (n & np.uint8(51)) + ((n >> 2) & np.uint8(51)) # 51=00110011b n = (n & np.uint8(15)) + ((n >> 4) & np.uint8(15)) # 15=00001111b return n @numba.vectorize([numba.int32(numba.int32)], nopython=True) def log2(n): """ fast integer floor log 2 :param n: input integer, must be positive :return: """ result = 0 for i in range(1, 32): if not n >> i: result = i - 1
def complete_grid(self): """Array of (edge,vertex,vertex) triples defining a complete graph.""" if self._complete_grid is None: self._complete_grid = make_complete_graph(self._num_vertices) return self._complete_grid @property def vertices(self): return self._vertices def gc(self): """Garbage collect temporary cached data structures.""" self._complete_grid = None @jit(numba.int32(numba.int32, numba.int32), nopython=True, cache=True) def find_complete_edge(v1, v2): """Find the edge index k of an unsorted pair of vertices (v1, v2).""" if v2 < v1: v1, v2 = v2, v1 return v1 + v2 * (v2 - 1) // 2 def make_complete_graph(num_vertices): """Constructs a complete graph. The pairing function is: k = v1 + v2 * (v2 - 1) // 2 Args: num_vertices: Number of vertices.
def cast_as_numba_type_attribute():
    """Return the float 4.4 converted through ``numba.int32`` (accessed as a module attribute)."""
    source_value = 4.4
    converted = numba.int32(source_value)
    return converted
import math import json import base64 import numpy as np from numba import njit, int32 # this is a 1-to-1 translation of our js bloom filters to python @njit(int32(int32)) def popcnt(v): v -= (v >> 1) & 0x55555555 v = (v & 0x33333333) + ((v >> 2) & 0x33333333) return ((v + (v >> 4) & 0xf0f0f0f) * 0x1010101) >> 24 # a * 16777619 mod 2**32 @njit(int32(int32)) def fnv_multiply(a): return a + (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) #// See https://web.archive.org/web/20131019013225/http://home.comcast.net/~bretm/hash/6.html @njit(int32(int32)) def fnv_mix(a): a += (a << 13) a ^= (a >> 7) a += (a << 3) a ^= (a >> 17)
from numba import float32, int32, jit import cProfile from dis import dis @jit(int32(int32, int32), nopython=True, nogil=True) def add_two(a, b): acc = 0 i = 0 while i < 1000: acc += a + b i += 1 return acc def add_two_wrap(a, b): return add_two(a, b) def add_two2(a, b): acc = 0 i = 0 while i < 1000: acc += a + b i += 1 return acc def test(): num = 100 print add_two_wrap(num, num + 1)
# i = 0 # # while True: # mother = np.random.randint(0,len(fit)) # father = np.random.randint(0,len(fit)) # r_rec = recombine_arrays(population[:, mother], population[:, father]) # new_pop[:, i] = r_rec[:, 0] # new_pop[:, i+1] = r_rec[:, 1] # i += 2 # if i >= n: # break # # return new_pop @jit(int32(float32[:]), nopython=True) def weighted_choice(weights): i = 0 rnd = np.random.random() * np.sum(weights) for i in range(len(weights)): rnd -= weights[i] if rnd < 0: break return i @jit(float32[:, :](float32[:, :], float32[:]), nopython=True) def recombine_population(population, fitness): weights = fitness.max() - fitness new_pop = np.zeros(population.shape, dtype=np.float32) n = population.shape[1]
def cu_angle_force(npa, pos, params, box0, box1, box2, box0_half, box1_half, box2_half,
                   angle_size, angle_list, force, virial_potential, one_three, one_sixth):
    """Accumulate angle (three-body) forces, virial and potential per particle.

    Each thread takes one particle index ``i`` from ``cuda.grid(1)`` and
    loops over that particle's ``angle_size[i]`` entries in ``angle_list[i]``.
    Each entry is read as ``(j, k, type, order)``. The running totals start
    from the values already stored in ``force[i]`` and ``virial_potential[i]``,
    so the kernel adds to whatever is there.

    ``minimum_value`` and ``cu_func`` are not parameters; they are resolved
    from the enclosing scope.

    NOTE(review): ``one_three`` and ``one_sixth`` are presumably 1/3 and 1/6
    (sharing per-angle quantities among particles) -- confirm with the caller.
    NOTE(review): ``type`` shadows the builtin of the same name inside this
    function.
    """
    i = cuda.grid(1)
    if i < npa:
        pi = pos[i]
        # result = cuda.local.array(5, dtype = nb.float32)
        # Running totals, seeded with the current contents of the outputs.
        result0 = force[i][0]
        result1 = force[i][1]
        result2 = force[i][2]
        result3 = virial_potential[i][0]
        result4 = virial_potential[i][1]
        for ai in range(nb.int32(0), angle_size[i]):
            j = angle_list[i][ai][0]
            k = angle_list[i][ai][1]
            type = angle_list[i][ai][2]
            order = angle_list[i][ai][3]
            pj = pos[j]
            pk = pos[k]

            # `order` says which slot of the a-b-c triple particle i fills;
            # b is the particle both difference vectors are measured from.
            # NOTE(review): pa/pb/pc stay unassigned if order is not 0, 1
            # or 2 -- confirm the list builder guarantees that range.
            if order == 0:
                pa = pi
                pb = pj
                pc = pk
            if order == 1:
                pa = pj
                pb = pi
                pc = pk
            if order == 2:
                pa = pj
                pb = pk
                pc = pi

            # Vectors from b to a and from b to c.
            d_ab0 = pa[0] - pb[0]
            d_ab1 = pa[1] - pb[1]
            d_ab2 = pa[2] - pb[2]

            d_cb0 = pc[0] - pb[0]
            d_cb1 = pc[1] - pb[1]
            d_cb2 = pc[2] - pb[2]

            # Shift each component by one box length when it lies beyond
            # half a box length in either direction.
            if d_ab0 >= box0_half:
                d_ab0 -= box0
            elif d_ab0 < -box0_half:
                d_ab0 += box0

            if d_ab1 >= box1_half:
                d_ab1 -= box1
            elif d_ab1 < -box1_half:
                d_ab1 += box1

            if d_ab2 >= box2_half:
                d_ab2 -= box2
            elif d_ab2 < -box2_half:
                d_ab2 += box2

            if d_cb0 >= box0_half:
                d_cb0 -= box0
            elif d_cb0 < -box0_half:
                d_cb0 += box0

            if d_cb1 >= box1_half:
                d_cb1 -= box1
            elif d_cb1 < -box1_half:
                d_cb1 += box1

            if d_cb2 >= box2_half:
                d_cb2 -= box2
            elif d_cb2 < -box2_half:
                d_cb2 += box2

            rsq_ab = d_ab0 * d_ab0 + d_ab1 * d_ab1 + d_ab2 * d_ab2
            r_ab = math.sqrt(rsq_ab)
            rsq_cb = d_cb0 * d_cb0 + d_cb1 * d_cb1 + d_cb2 * d_cb2
            r_cb = math.sqrt(rsq_cb)

            # Cosine of the angle at b: normalised dot product, clamped to
            # [-1, 1] so the square root below never sees a negative value.
            cos_abc = d_ab0 * d_cb0 + d_ab1 * d_cb1 + d_ab2 * d_cb2
            cos_abc /= r_ab * r_cb
            if cos_abc > nb.float32(1.0):
                cos_abc = nb.float32(1.0)
            if cos_abc < -nb.float32(1.0):
                cos_abc = -nb.float32(1.0)

            # Floor the sine at minimum_value before inverting it; from
            # here on `sin_abc` holds 1/sin, which is what cu_func receives.
            sin_abc = math.sqrt(nb.float32(1.0) - cos_abc * cos_abc)
            if sin_abc < minimum_value:
                sin_abc = minimum_value
            sin_abc = nb.float32(1.0) / sin_abc

            pms = params[type]
            # Two-element scratch array written by cu_func: fp[0] feeds the
            # force prefactor, fp[1] the potential.
            # NOTE(review): presumably fp[0] is the derivative with respect
            # to the angle and fp[1] the energy -- confirm against cu_func.
            fp = cuda.local.array(2, dtype=nb.float32)
            cu_func(cos_abc, sin_abc, pms, fp)

            a = -fp[0] * sin_abc
            a11 = a * cos_abc / rsq_ab
            a12 = -a / (r_ab * r_cb)
            a22 = a * cos_abc / rsq_cb

            # fab / fcb are the contributions tied to the a and c ends.
            fab0 = a11 * d_ab0 + a12 * d_cb0
            fab1 = a11 * d_ab1 + a12 * d_cb1
            fab2 = a11 * d_ab2 + a12 * d_cb2

            fcb0 = a22 * d_cb0 + a12 * d_ab0
            fcb1 = a22 * d_cb1 + a12 * d_ab1
            fcb2 = a22 * d_cb2 + a12 * d_ab2

            # Particle i takes the share for the slot it fills: +fab as a,
            # -(fab + fcb) as b, +fcb as c.
            if order == 0:
                result0 += fab0
                result1 += fab1
                result2 += fab2

            if order == 1:
                result0 -= fab0 + fcb0
                result1 -= fab1 + fcb1
                result2 -= fab2 + fcb2

            if order == 2:
                result0 += fcb0
                result1 += fcb1
                result2 += fcb2

            vx = d_ab0 * fab0 + d_cb0 * fcb0
            vy = d_ab1 * fab1 + d_cb1 * fcb1
            vz = d_ab2 * fab2 + d_cb2 * fcb2
            virial = one_sixth * (vx + vy + vz)
            # if i==35 and ai == 0:
            #     print(i, ai, vy, a, cos_abc, rsq_cb)
            potential = fp[1] * one_three
            result3 += virial
            result4 += potential

        # One write-back per particle after all of its angles are summed.
        force[i][0] = result0
        force[i][1] = result1
        force[i][2] = result2
        virial_potential[i][0] = result3
        virial_potential[i][1] = result4
# (Tail of a function whose beginning is outside this snippet.
# NOTE(review): the nesting of the next four lines is reconstructed --
# confirm against the full file.)
        if np.isnan(ai):
            f = True
            break
    return f


@ndreduce([int64(int32), int64(int64), int64(float32), int64(float64)])
def count(a):
    """Return how many elements of ``a`` are not NaN."""
    non_missing = 0
    # `a.flat` visits every element regardless of the array's shape.
    for ai in a.flat:
        if not np.isnan(ai):
            non_missing += 1
    return non_missing


@ndreduce([int32(int32), int64(int64), float32(float32), float64(float64)])
def nansum(a):
    """Return the sum of the elements of ``a`` that are not NaN."""
    # Starts at 0, so the result is 0 when every element is NaN or the
    # input is empty.
    asum = 0
    for ai in a.flat:
        if not np.isnan(ai):
            asum += ai
    return asum


@ndreduce([float32(float32), float64(float64)])
def nanmean(a):
    # The local `count` shadows the module-level count() defined above.
    # The rest of this function lies outside this snippet.
    asum = 0.0
    count = 0
    for ai in a.flat:
        if not np.isnan(ai):
            asum += ai
def dowork(M_f_start, nfs_sq, d_src_ar, d_dst_ar, weight_ar, sigma_m, E2, sigma_0,
           fovshift, nfs, W_cut, osd0p, osd1r):
    """Compute Gaussian connection weights for (source, destination) pairs.

    Each thread takes one ``(i_src, i_dst)`` pair from ``cuda.grid(2)``,
    computes a Gaussian of the in-plane offset between the two positions,
    and stages the triple ``(weight, i_src, i_dst)`` in a shared array.
    After a barrier, thread (0, 0) of the block copies the staged weights to
    ``weight_ar[i_dst * nfs_sq + i_src]``.

    NOTE(review): the line structure of this function was reconstructed from
    a whitespace-collapsed source; in particular, confirm that the second
    barrier and the write-out really sit inside the bounds check.
    NOTE(review): both ``cuda.syncthreads()`` calls are inside the
    ``i_src < nfs_sq and i_dst < nfs_sq`` branch, so threads failing that
    test skip the barriers and never zero their slot -- confirm the launch
    grid never produces such threads in a partially filled block.
    NOTE(review): slots left at zero are written out too, with index
    ``0 * nfs_sq + 0``, i.e. they store 0.0 into ``weight_ar[0]`` -- confirm
    that element is not expected to hold a real weight.
    """
    # Work out i_src and i_dst based on the 2-D thread index
    i_src, i_dst = cuda.grid(2)

    if i_src < nfs_sq and i_dst < nfs_sq:

        # Temporary shared memory for weights
        # Three float32 values per thread (weight, i_src, i_dst), placed at
        # the offset returned by shifted_idx3.
        # NOTE(review): shifted_idx3 is defined elsewhere; 12288 presumably
        # bounds the largest offset it can return -- confirm.
        tmp_w = cuda.shared.array(12288, dtype=float32)
        myidx = (cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.x)
        offsidx = shifted_idx3(myidx)
        tmp_w[offsidx] = float32(0.0)
        tmp_w[offsidx + 1] = float32(0.0)
        tmp_w[offsidx + 2] = float32(0.0)
        cuda.syncthreads()

        # Compute the location of d_src_ar, this defines what sigma will
        # be. As r (as opp. to phi) increases, the sigma should increase.
        M_f = float32(nfs) / (E2 * math.log(((1 + d_src_ar[i_src, 1]) / (2 * E2)) + 1))

        # Set some of M_f to 1 to ensure the fan-out starts at around the
        # edge of the foveal region.
        # (The value actually assigned is M_f_start.)
        if (1 + d_src_ar[i_src, 1]) < fovshift:
            M_f = M_f_start

        # Compute modified sigma and 3 times this value. _sigma is a
        # function of r, aka d_src_ar[1]. M_f is the function of r.
        _sigma = (sigma_m / M_f) - (sigma_m / M_f_start) + sigma_0
        three_sigma = float32(3.0) * _sigma

        # in-xy-plane distance (ignore d_src_ar[2]/dstdoc[2])
        xd = (d_src_ar[i_src, 0] - d_dst_ar[i_dst, 0] + osd0p)
        yd = (d_src_ar[i_src, 1] - d_dst_ar[i_dst, 1] + osd1r)
        # Cheap box test first; the sqrt/exp run only within 3 sigma on
        # both axes.
        if abs(xd) < three_sigma and abs(yd) < three_sigma:
            dist = math.sqrt(math.pow(xd, 2) + math.pow(yd, 2))
            gauss = math.exp(-0.5 * math.pow(dist / _sigma, 2))
            if gauss > W_cut:
                # Write result into weight_ar
                # (staged in tmp_w here; copied to weight_ar below)
                # The indices are stored as float32 and cast back with
                # int32() on write-out, which is exact only up to 2**24.
                tmp_w[offsidx] = float32(gauss)
                tmp_w[offsidx + 1] = float32(i_src)
                tmp_w[offsidx + 2] = float32(i_dst)

        # Sync threads, then access device memory with any results
        cuda.syncthreads()

        if cuda.threadIdx.x == 0 and cuda.threadIdx.y == 0:
            tpb = cuda.blockDim.x * cuda.blockDim.y
            # Write data from tmp_w to res_ar, but only in ONE thread from
            # the threadblock. Should avoid racing.
            for idx in range(0, tpb):  # 512 was hard coded here; changed it for tpb
                offsidx2 = shifted_idx3(idx)
                theweight = tmp_w[offsidx2]  # weight should be the first one, so no +1/+2
                # Add to weight_ar
                # (this is a plain store, not an accumulation)
                weight_idx = int32(tmp_w[offsidx2 + 2]) * nfs_sq + int32(tmp_w[offsidx2 + 1])
                weight_ar[weight_idx] = theweight

    return  # end dowork()
def foo():
    """Build one list holding an int32 and one holding a float32.

    Returns the pair ``(int_list._dtype, float_list._dtype)``.
    """
    int_list = List()
    float_list = List()
    int_list.append(int32(1))
    float_list.append(float32(1.0))
    return int_list._dtype, float_list._dtype
def exprefixsum(masks, indices, init = 0, nelem = None): """ exclusive prefix sum """ nelem = masks.size if nelem is None else nelem carry = init for i in xrange(nelem): indices[i] = carry if masks[i] != 0: carry += masks[i] #indices[nelem] = carry return carry @numba.jit(int32(int32[:],int32[:],int32), nopython=False) def exprefixsumNumba(in_ary, out_ary, init = 0): """ exclusive prefix sum """ nelem = in_ary.size carry = init for i in range(nelem): out_ary[i] = carry carry += in_ary[i] return carry #@numba.jit(int32(int32[:],int32), nopython=False) @numba.njit
eq_t[h] += 1.0/nb_best_hand # impossible : error else: return -1 # normalize eq_w_agg and eq_t_agg for h in xrange(p): eq_agg[h, 0] = eq_w[h]/n eq_agg[h, 1] = eq_t[h]/n return eq_agg rank_fast = jit(int32(int32[:], int32[:], uint32[:], int32[:], int32, int32, int32[:], int32[:], int32[:]))(rank) exhaustive_block_fast = jit(int32[:](int32[:, :], int32[:], int32[:], uint32[:], int32[:], int32, int32, int32[:], int32[:], int32[:]))(exhaustive_block) def exhaustive_eval(player_card, table_card): """compute all possible games given the player/table cards (as a numbers from 0 to 51) and return equity win/tie for each player""" p = player_card.shape[0] equity_arr = np.zeros([p, 2], dtype=np.float32) print '\n---------------- Exhaustive eval start' print 'player_card=\n{}'.format(player_card) print 'table_card=\n{}'.format(table_card) print 'p={}'.format(p)
# (Tail of a function whose beginning is outside this snippet.)
    return np.array([x3d, y3d, z3d])


@numba.njit(
    numba.typeof(
        (0.0, 0.0))(numba.float32, numba.float32, numba.float32, numba.float32,
                    numba.float32, numba.float32, numba.float32))
def project_to_2d(x, y, z, cx, cy, fx, fy):
    """Project the point (x, y, z) to 2-D coordinates.

    Computes ``fx * x / z + cx`` and ``fy * y / z + cy`` and returns them as
    a pair. The declared return type is that of the tuple ``(0.0, 0.0)``.
    """
    # Replace an exact zero depth with 0.001 so the divisions below are
    # defined.
    if z == 0:
        z = 0.001
    x2d = fx * x / z + cx
    y2d = fy * y / z + cy
    return x2d, y2d


@numba.njit(numba.int32(numba.float32, numba.float32, numba.float32))
def clip_round(value, minvalue, maxvalue):
    """Clamp ``value`` to [minvalue, maxvalue], round it, and return an int."""
    # max() receives one tuple argument here, which yields the larger of
    # its two items.
    return int(round((min(max((value, minvalue)), maxvalue))))


@numba.njit(numba.float32[:, :](numba.float32[:, :], numba.float32[:, :], numba.float32[:, :],
                                numba.float32[:, :], numba.float32[:], numba.typeof((0, 0))))
def reproject(d_image, rgb_mat, d_mat, R, T, shape):
    # Output buffer of the requested shape, zero-filled.
    buffer = np.zeros(shape, np.float32)
    h, w = d_image.shape
    # NOTE(review): d_mat is presumably a 3x3 camera matrix with the focal
    # lengths on the diagonal and the principal point in the last column --
    # confirm. The rest of this function lies outside this snippet.
    dfx = d_mat[0, 0]
    dfy = d_mat[1, 1]
    dcx = d_mat[0, 2]
    dcy = d_mat[1, 2]
import math


# NOTE(review): `jit` is used here before the `from numba import jit` line
# further down; presumably it was already imported above this snippet.
@jit
def f(x,y):
    # A somewhat trivial example
    return x + y

# English summary of the string below: in this mode compilation is deferred
# until the function is first executed. Numba infers the argument types at
# call time and generates optimised code from that information. It can also
# compile separate specialisations per input type; for example, calling f()
# above with integers or with complex numbers produces different code paths.
'''
在此模式下,编译将推迟到第一个函数执行。Numba将在调用时推断参数类型,并根据
此信息生成优化代码。Numba还可以根据输入类型编译单独的特化。例如,f()使用整数或复数调用上面的函数将生成不同的代码路径:
'''
print(f(1, 2))
print(f(1j, 2))

from numba import jit, int32


# This definition rebinds the name `f`, replacing the lazily compiled one.
@jit(int32(int32,int32))
def f(x, y):
    return x + y

# English summary of the string below: int32(int32, int32) is the function's
# signature. The matching specialisation is compiled by the @jit decorator
# and no other specialisation is allowed. This is useful for fine-grained
# control over the types the compiler picks (e.g. single-precision floats).
# If the return type is omitted, Numba tries to infer it. Signatures can
# also be strings, and several can be passed as a list.
# NOTE(review): the sentence about omitting the return type has lost its two
# inline code samples; presumably "(int32, int32)" instead of
# "int32(int32, int32)" -- confirm against the Numba documentation.
'''
int32(int32, int32)是函数的签名。在这种情况下,相应的特化将由@jit装饰器编译,并且不允许其他专门化。如果您希望对编译器
选择的类型进行细粒度控制(例如,使用单精度浮点数),这将非常有用。
如果省略返回类型,例如通过写而不是 ,Numba将尝试为您推断它。函数签名也可以是字符串,您可以将
其中的几个作为列表传递;
'''
print(f(1, 2))
# Both arguments are outside the int32 range named in the signature.
# NOTE(review): presumably this demonstrates truncation of the high bits --
# confirm the expected output.
print(f(2**31, 2**31 + 1))


# Calling and inlining other functions
@jit
# (Tail of a function whose beginning is outside this snippet.)
    return np.interp(xp, [x1, x2], [y1, y2])


def complex_grid(xlim, ylim, nx, ny):
    '''
    Return a 2-D grid of complex numbers bounded by the xlim and ylim ranges.

    The real parts are nx evenly spaced values from xlim[0] to xlim[1]
    (both included) and the imaginary parts are ny evenly spaced values from
    ylim[0] to ylim[1]. np.meshgrid is called with its default indexing, so
    the result has ny rows and nx columns: element [r, c] is x[c] + 1j*y[r].
    '''
    x = np.linspace(xlim[0], xlim[1], nx)
    y = np.linspace(ylim[0], ylim[1], ny)
    xx, yy = np.meshgrid(x, y)
    return xx + 1j*yy


@jit(int32(complex128, complex128, int32, float64, int32), nopython=True, cache=True)
def iterate(z, C, n, zmax, niter):
    '''
    Iterate z -> z**n + C and return the zero-based index of the first
    iteration after which the absolute value of z is greater than zmax.

    If the threshold is never exceeded within niter iterations, the index of
    the last iteration (niter - 1) is returned, which is the same value as
    an escape on the final iteration.
    '''
    # Compare squared magnitudes to avoid a square root per iteration.
    zmax2 = zmax**2
    # NOTE(review): k is only assigned by the loop, so it is never set when
    # niter <= 0 -- confirm callers always pass niter >= 1.
    for k in range(niter):
        z = pow(z, n) + C
        if z.imag**2 + z.real**2 > zmax2:
            break
    return k