def set_numba_threads(n):
    """Run the enclosed code with numba limited to ``n`` threads.

    This is a generator with a single ``yield``; it is presumably wrapped
    with ``contextlib.contextmanager`` (the decorator is not visible here).
    The thread count that was active on entry is restored on exit, even if
    the managed code raises.

    Parameters
    ----------
    n : int
        Number of threads passed to ``numba.set_num_threads``.
    """
    saved_thread_count = numba.get_num_threads()
    try:
        numba.set_num_threads(n)
        yield
    finally:
        # Always put the caller's setting back.
        numba.set_num_threads(saved_thread_count)
def _test_func(acc, buf, local_mask):
    """Kernel body: set the thread mask in the parent and record the result.

    ``nthreads``, ``N`` and ``child_func`` come from the enclosing scope.
    The first element of ``local_mask`` is the requested thread count; the
    thread count observed afterwards is accumulated into ``acc[0]``.
    """
    set_num_threads(nthreads)
    requested = local_mask[0]
    # set threads in parent function
    set_num_threads(requested)
    if requested < N:
        child_func(buf, requested)
    acc[0] += get_num_threads()
def test_func(nthreads):
    """Fill a 5-element buffer with the thread count seen in each iteration.

    The thread count is set to ``nthreads`` before the ``prange`` loop and
    each iteration stores ``get_num_threads()`` in its slot.
    """
    n_items = 5
    buf = np.empty((n_items,))
    set_num_threads(nthreads)
    for idx in prange(n_items):
        buf[idx] = get_num_threads()
    return buf
def test_func():
    """Count the distinct thread ids that execute a large ``prange`` loop.

    ``mask`` comes from the enclosing scope and is applied as the thread
    count first.  Returns a tuple of (number of unique thread ids recorded,
    current thread count).
    """
    set_num_threads(mask)
    n_items = 5000000
    buf = np.empty((n_items,))
    for idx in prange(n_items):
        buf[idx] = get_thread_id()
    return len(np.unique(buf)), get_num_threads()
def _transform(self, X, y=None):
    """Transform input time series.

    Parameters
    ----------
    X : 3D np.ndarray of shape = [n_instances, n_dimensions, series_length]
        panel of time series to transform
    y : ignored argument for interface compatibility

    Returns
    -------
    pandas DataFrame, transformed features
    """
    X = X[:, 0, :].astype(np.float32)

    # An out-of-range n_jobs (< 1 or more than the available cores) means
    # "use every core".
    n_cores = multiprocessing.cpu_count()
    if self.n_jobs < 1 or self.n_jobs > n_cores:
        n_jobs = n_cores
    else:
        n_jobs = self.n_jobs

    prev_threads = get_num_threads()
    set_num_threads(n_jobs)
    try:
        X_ = _transform(X, self.parameters)
    finally:
        # The thread count is process-wide state: restore it even when the
        # kernel raises, so a failed transform does not leak this setting.
        set_num_threads(prev_threads)
    return pd.DataFrame(X_)
def _test_func(buf, local_mask):
    """Kernel body: check the thread mask left behind by ``child``.

    ``nthreads`` and ``child`` come from the enclosing scope; the first
    element of ``local_mask`` is the mask handed to ``child``.
    """
    set_num_threads(nthreads)
    expected = local_mask[0]
    # when the threads exit the child functions they should
    # have a TLS slot value of the local mask as it was set
    # in child
    if expected < config.NUMBA_NUM_THREADS:
        child(buf, expected)
        assert get_num_threads() == expected
def test_numba_info():
    """Check ``d.numba_info()`` against numba's own reported settings.

    With the JIT disabled the helper must return ``None``; otherwise its
    ``threading`` and ``threads`` fields must match numba's threading layer
    and current thread count.
    """
    info = d.numba_info()
    if numba.config.DISABLE_JIT:
        assert info is None
        return

    assert info is not None
    assert info.threading == numba.threading_layer()
    assert info.threads == numba.get_num_threads()
def nn_descent_internal_high_memory_parallel(
    current_graph,
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    verbose=False,
):
    """Refine ``current_graph`` in place over up to ``n_iters`` iterations.

    Each iteration builds new/old candidate neighbour lists, then walks the
    vertices in fixed-size blocks, generating updates for each block and
    applying them to ``current_graph``.  ``in_graph`` holds one set per row
    of ``current_graph[0]`` and is passed to the update step.  Iteration
    stops early once the number of applied changes in an iteration is at
    most ``delta * n_neighbors * n_vertices``.

    The function returns ``None``; results are left in ``current_graph``.
    """
    n_vertices = indptr.shape[0] - 1
    block_size = 16384
    n_blocks = n_vertices // block_size
    n_threads = numba.get_num_threads()

    in_graph = [
        set(current_graph[0][row].astype(np.int64))
        for row in range(current_graph[0].shape[0])
    ]

    for iteration in range(n_iters):
        if verbose:
            print("\t", iteration + 1, " / ", n_iters)

        (new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
            current_graph, max_candidates, rng_state, n_threads
        )

        n_changes = 0
        # n_blocks + 1 so the trailing partial block is covered as well.
        for block in range(n_blocks + 1):
            block_start = block * block_size
            block_end = min(n_vertices, (block + 1) * block_size)

            new_candidate_block = new_candidate_neighbors[block_start:block_end]
            old_candidate_block = old_candidate_neighbors[block_start:block_end]
            dist_thresholds = current_graph[1][:, 0]

            updates = generate_graph_updates(
                new_candidate_block,
                old_candidate_block,
                dist_thresholds,
                inds,
                indptr,
                data,
                dist,
            )
            n_changes += apply_graph_updates_high_memory(
                current_graph, updates, in_graph
            )

        if n_changes <= delta * n_neighbors * n_vertices:
            if verbose:
                print("\tStopping threshold met -- exiting after", iteration + 1, "iterations")
            return
def filter_genes(adata, minc):
    """Filter the columns of ``adata`` by how many non-zero entries they have.

    When the module-level ``use_fastpp`` flag is set, a numba kernel counts
    non-zero entries per column of ``adata.X`` (read through its ``indices``
    and ``data`` arrays), stores the counts in ``adata.var['n_cells']`` and
    applies the subsetting variant chosen by the module-level ``strategy``.
    Otherwise the work is delegated to ``sc.pp.filter_genes``.  Timing
    checkpoints are printed along the way.

    Parameters
    ----------
    adata : object exposing ``X``, ``obs``, ``var`` and ``shape``
        (presumably an AnnData with a CSR matrix -- TODO confirm).
    minc : minimum per-column count for a column to be kept.

    Returns
    -------
    tuple ``(adata, cols_to_keep, cols)``; ``cols_to_keep`` stays ``0`` on
    the non-fast path, and ``cols`` falls back to ``adata.shape[1]`` when it
    is still ``0``.
    """
    t0 = time.time()
    cols_to_keep = 0
    cols = 0
    if use_fastpp:

        @numba.njit(cache=True, parallel=True)
        def get_cols_to_keep(indices, data, minc, colcount, nthr):
            # One private row of counters per parallel iteration, so the
            # prange iterations never write to the same element.
            counts = np.zeros((nthr, colcount), dtype=np.int32)
            for i in numba.prange(nthr):
                # Each iteration handles one contiguous slice of the entries.
                start = i * indices.shape[0] // nthr
                end = (i + 1) * indices.shape[0] // nthr
                for j in range(start, end):
                    if data[j] != 0 and indices[j] < colcount:
                        #if indices[j]<colcount:
                        counts[i, indices[j]] += 1
            # Collapse the per-iteration partial counts into one vector.
            counts = np.sum(counts, axis=0)
            keep_cols = counts >= minc
            return counts, keep_cols

        ncols = adata.X.shape[1]
        nthr = numba.get_num_threads()
        print("filter_genes: prep ", time.time() - t0)
        counts, cols_to_keep = get_cols_to_keep(adata.X.indices, adata.X.data, minc, ncols, nthr)
        print("filter_genes: compute ", time.time() - t0)
        adata.var['n_cells'] = counts
        print("filter_genes: set metadata ", time.time() - t0)
        # Alternative ways of applying the column mask, selected by the
        # module-level `strategy`.  Strategies 8 and 11 only compute `cols`
        # and leave `adata` itself unsubsetted.
        if strategy == 1: adata = adata[:, cols_to_keep]
        if strategy == 2: adata._inplace_subset_var(cols_to_keep)
        if strategy == 3: adata = adata[:, cols_to_keep].copy()
        if strategy == 4: adata = anndata.AnnData(adata.X[:, cols_to_keep], adata.obs, adata.var.iloc[cols_to_keep, :])
        # NOTE(review): drop(..., inplace=True) returns None in pandas, so
        # this strategy appears to pass None as the third argument -- confirm.
        if strategy == 5: adata = anndata.AnnData(
                adata.X[:, cols_to_keep], adata.obs,
                adata.var.drop(adata.var.iloc[np.logical_not(cols_to_keep), :], inplace=True))
        if strategy == 6: adata._inplace_subset_var(cols_to_keep)
        if strategy == 7: adata = anndata.AnnData(csr_subset(adata.X, None, cols_to_keep), adata.obs, adata.var.iloc[cols_to_keep, :])
        if strategy == 8: cols = csr_col_subset(adata.X, cols_to_keep)
        if strategy == 9: adata._inplace_subset_var(cols_to_keep)
        if strategy == 10: adata = anndata.AnnData(adata.X[:, cols_to_keep], adata.obs, adata.var.iloc[cols_to_keep, :])
        if strategy == 11: cols = csr_col_subset(adata.X, cols_to_keep)
    else:
        sc.pp.filter_genes(adata, min_cells=minc)
    print("filter_genes: filter total", time.time() - t0)
    # `cols` is only set by strategies 8 and 11; otherwise report the width
    # of the (possibly subsetted) adata.
    if cols == 0: cols = adata.shape[1]
    return adata, cols_to_keep, cols
def __call__(self, *, n_steps, mu_coeff, post_step, post_iter, fields):
    """Run the compiled stepper and return its wall time per timestep.

    Every value in ``fields`` is unpacked into an ``_Impl`` pair of its
    meta-and-data and boundary-condition implementations, and these are
    passed positionally after the scalar arguments.  Numba's
    experimental-feature warnings are suppressed for the duration of the
    call.

    The assertion requires that, unless this instance is single-threaded,
    numba's current thread count equals ``self.n_threads``.
    """
    assert self.n_threads == 1 or numba.get_num_threads() == self.n_threads

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=NumbaExperimentalFeatureWarning)
        field_impls = [
            _Impl(field=entry.impl[IMPL_META_AND_DATA], bc=entry.impl[IMPL_BC])
            for entry in fields.values()
        ]
        return self.__call(
            n_steps,
            mu_coeff,
            post_step,
            post_iter,
            *field_impls,
            self.traversals.null_impl,
        )
def klip_chunk_svd(image_vecs_meansub, n_images, mtx_u0, diag_s0, mtx_v0, k_klip, reuse, strategy, exclusion_values, exclusion_deltas, signal_vecs):
    """Subtract a ``k_klip``-component projection from every frame.

    For each column ``i`` of ``image_vecs_meansub`` a basis (``eigenimages``)
    is obtained and the column's projection onto that basis is subtracted:

    * ``reuse`` true: the first ``k_klip`` columns of ``mtx_u0`` are used
      for every frame.
    * ``reuse`` false: a range of frames to exclude is computed with
      ``exclusions_to_range``; the basis then comes either from downdating
      the supplied decomposition (``KlipStrategy.DOWNDATE_SVD``) or from a
      fresh SVD of the remaining columns.

    Returns
    -------
    ``(output, output_model)``.  ``output`` has the shape of
    ``image_vecs_meansub``.  ``output_model`` is a zero array shaped like
    ``signal_vecs`` (or ``None`` when ``signal_vecs`` is ``None``); it is
    allocated here but never written to in this function.
    """
    n_frames = image_vecs_meansub.shape[1]
    output = np.zeros_like(image_vecs_meansub)
    if signal_vecs is not None:
        output_model = np.zeros_like(signal_vecs)
    else:
        output_model = None
    print('klip_chunk_svd running with', numba.get_num_threads(), 'threads on', n_frames, 'frames')
    for i in numba.prange(n_frames):
        if not reuse:
            min_excluded_idx, max_excluded_idx = exclusions_to_range(
                n_images=n_images,
                current_idx=i,
                exclusion_values=exclusion_values,
                exclusion_deltas=exclusion_deltas,
            )
            # The excluded range is inclusive at both ends.
            n_excluded = max_excluded_idx - min_excluded_idx + 1
            print('processing frame', i, ', excluding', n_excluded, ' frames (from frame', min_excluded_idx, 'to', max_excluded_idx, ")")
            if strategy == constants.KlipStrategy.DOWNDATE_SVD:
                assert mtx_u0 is not None
                assert diag_s0 is not None
                assert mtx_v0 is not None
                # Take k_klip + n_excluded components before the downdate,
                # then truncate the result back to k_klip below.
                subset_mtx_u0 = np.ascontiguousarray(mtx_u0[:, :k_klip + n_excluded])
                subset_diag_s = diag_s0[:k_klip + n_excluded]
                subset_mtx_v0 = np.ascontiguousarray(mtx_v0[:, :k_klip + n_excluded])
                new_u, _, _ = learning.minimal_downdate(
                    subset_mtx_u0,
                    subset_diag_s,
                    subset_mtx_v0,
                    min_col_to_remove=min_excluded_idx,
                    max_col_to_remove=max_excluded_idx + 1,
                )
                eigenimages = new_u[:, :k_klip]
            else:
                # Drop the excluded columns and decompose what is left.
                subset_image_vecs = utils.drop_idx_range_cols(
                    image_vecs_meansub, min_excluded_idx, max_excluded_idx + 1)
                eigenimages, _, _ = learning._numba_svd_wrap(
                    subset_image_vecs, k_klip)
        else:
            assert mtx_u0 is not None
            eigenimages = mtx_u0[:, :k_klip]
        meansub_target = image_vecs_meansub[:, i]
        # Since we may have truncated by columns above, this makes the array
        # contiguous again and silences the NumbaPerformanceWarning
        eigenimages = np.ascontiguousarray(eigenimages)
        # Remove the component of the target lying in the span of the basis.
        output[:, i] = meansub_target - eigenimages @ (
            eigenimages.T @ meansub_target)
    return output, output_model
def condensation(solver, n_cell, cell_start_arg, v, particle_temperatures, r_cr, n, vdry, idx, rhod, thd, qv, dv, prhod, pthd, pqv, kappa, rtol_x, rtol_thd, dt, substeps, cell_order, ripening_flags):
    """Unwrap storage objects and dispatch to ``AlgorithmicMethods._condensation``.

    Most arguments are forwarded through their ``.data`` attribute; ``dv``,
    ``kappa``, ``rtol_x``, ``rtol_thd``, ``dt`` and ``cell_order`` are
    passed as they are.  The thread count handed to the kernel is capped at
    ``n_cell``.
    """
    # Never ask for more threads than there are cells.
    thread_count = min(numba.get_num_threads(), n_cell)
    AlgorithmicMethods._condensation(
        solver,
        thread_count,
        n_cell,
        cell_start_arg.data,
        v.data,
        particle_temperatures.data,
        r_cr.data,
        n.data,
        vdry.data,
        idx.data,
        rhod.data,
        thd.data,
        qv.data,
        dv,
        prhod.data,
        pthd.data,
        pqv.data,
        kappa,
        rtol_x,
        rtol_thd,
        dt,
        substeps.data,
        cell_order,
        ripening_flags.data,
    )
def get_num_threads():
    """
    Report the thread count numba is currently configured with.

    Returns
    -------
    int
        Value of ``numba.get_num_threads()``.
    """
    current = numba.get_num_threads()
    return current
def condensation(solver, n_cell, cell_start_arg, v, v_cr, n, vdry, idx, rhod, thd, qv, dv, prhod, pthd, pqv, kappa, rtol_x, rtol_thd, dt, counters, cell_order, RH_max, success):
    """Unwrap storage objects and dispatch to ``AlgorithmicMethods._condensation``.

    Most arguments are forwarded through their ``.data`` attribute; ``dv``,
    ``kappa``, ``rtol_x``, ``rtol_thd``, ``dt`` and ``cell_order`` are
    passed as they are.  ``counters`` is a mapping from which the
    ``'n_substeps'``, ``'n_activating'``, ``'n_deactivating'`` and
    ``'n_ripening'`` entries are forwarded, in that order.  The thread count
    handed to the kernel is capped at ``n_cell``.
    """
    # Never ask for more threads than there are cells.
    thread_count = min(numba.get_num_threads(), n_cell)
    AlgorithmicMethods._condensation(
        solver,
        thread_count,
        n_cell,
        cell_start_arg.data,
        v.data,
        v_cr.data,
        n.data,
        vdry.data,
        idx.data,
        rhod.data,
        thd.data,
        qv.data,
        dv,
        prhod.data,
        pthd.data,
        pqv.data,
        kappa,
        rtol_x,
        rtol_thd,
        dt,
        counters['n_substeps'].data,
        counters['n_activating'].data,
        counters['n_deactivating'].data,
        counters['n_ripening'].data,
        cell_order,
        RH_max.data,
        success.data,
    )
def test_func(nthreads):
    """Check, per ``prange`` iteration, the thread mask left behind by ``child``.

    ``M``, ``N``, ``mask`` and ``child`` come from the enclosing scope.
    Returns the ``(M, N)`` buffer that was handed to ``child``.
    """
    buf = np.zeros((M, N))
    set_num_threads(nthreads)
    for row in prange(M):
        expected = 1 + row % mask
        # when the threads exit the child functions they should
        # have a TLS slot value of the local mask as it was set
        # in child
        if expected < config.NUMBA_NUM_THREADS:
            child(buf, expected)
            assert get_num_threads() == expected
    return buf
def _test_func(nthreads):
    """Set a per-iteration thread mask in the parent and sum what is observed.

    ``M``, ``N``, ``mask`` and ``child_func`` come from the enclosing scope.
    Returns ``(acc, buf)``: the sum over iterations of ``get_num_threads()``
    and the ``(M, N)`` buffer passed to ``child_func``.
    """
    acc = 0
    buf = np.zeros((M, N))
    set_num_threads(nthreads)
    for row in prange(M):
        requested = 1 + row % mask
        # set threads in parent function
        set_num_threads(requested)
        if requested < N:
            child_func(buf, requested)
        # Kept as an augmented assignment so it reads as a reduction.
        acc += get_num_threads()
    return acc, buf
def __call__(self, nt, mu_coeff, advectee, advectee_bc, advector, advector_bc, g_factor, g_factor_bc, vectmp_a, vectmp_a_bc, vectmp_b, vectmp_b_bc, vectmp_c, vectmp_c_bc, psi_min, psi_min_bc, psi_max, psi_max_bc, beta_up, beta_up_bc, beta_down, beta_down_bc):
    """Forward all arguments, unchanged and in order, to the compiled stepper.

    The assertion requires that, unless this instance is single-threaded,
    numba's current thread count equals ``self.n_threads``.  Returns
    whatever the underlying callable returns.
    """
    assert self.n_threads == 1 or numba.get_num_threads() == self.n_threads

    # Each field is followed by its boundary-condition companion.
    stepper_args = (
        nt, mu_coeff,
        advectee, advectee_bc,
        advector, advector_bc,
        g_factor, g_factor_bc,
        vectmp_a, vectmp_a_bc,
        vectmp_b, vectmp_b_bc,
        vectmp_c, vectmp_c_bc,
        psi_min, psi_min_bc,
        psi_max, psi_max_bc,
        beta_up, beta_up_bc,
        beta_down, beta_down_bc,
    )
    return self.__call(*stepper_args)
def numba_info():
    """Report numba's threading layer and thread count.

    ``_par_test(100)`` is run first and its result logged at debug level.
    If ``numba.threading_layer()`` then raises ``ValueError``, ``None`` is
    returned; otherwise a ``NumbaInfo(layer, thread_count)`` is returned.
    """
    total = _par_test(100)
    _log.debug('sum: %d', total)

    try:
        layer = numba.threading_layer()
    except ValueError:
        _log.info('Numba threading not initialized')
        return None

    _log.info('numba threading layer: %s', layer)
    return NumbaInfo(layer, numba.get_num_threads())
def filter_cells(adata, ming, maxg):
    """Filter the rows of ``adata`` by how many stored entries they have.

    When the module-level ``use_fastpp`` flag is set, row lengths are taken
    from consecutive differences of ``adata.X.indptr``, stored in
    ``adata.obs['n_genes']``, and rows with a length in ``[ming, maxg]`` are
    kept using the subsetting variant chosen by the module-level
    ``strategy``.  Otherwise ``sc.pp.filter_cells`` is called twice, once
    for each bound.  Timing checkpoints are printed along the way.

    Parameters
    ----------
    adata : object exposing ``X``, ``obs``, ``var`` and ``shape``
        (presumably an AnnData with a CSR matrix -- TODO confirm).
    ming, maxg : inclusive lower and upper bounds on the row length.

    Returns
    -------
    tuple ``(adata, rows_to_keep, rows)``; ``rows_to_keep`` stays ``0`` on
    the non-fast path, and ``rows`` falls back to ``adata.shape[0]`` when it
    is still ``0``.
    """
    t0 = time.time()
    rows_to_keep = 0
    rows = 0
    if use_fastpp:

        @numba.njit(cache=True, parallel=True)
        def get_rows_to_keep(indptr, ming, maxg):
            # Difference of consecutive row pointers = entries per row.
            lens = indptr[1:] - indptr[:-1]
            keep_rows = np.logical_and(lens >= ming, lens <= maxg)
            return lens, keep_rows

        # NOTE(review): nrows, nelems and nthr are assigned but not used
        # anywhere below in this function.
        nrows = adata.X.shape[0]
        nelems = adata.X.data.shape[0]
        nthr = numba.get_num_threads()
        print("filter_cells: prep ", time.time() - t0)
        row_lengths, rows_to_keep = get_rows_to_keep(adata.X.indptr, ming, maxg)
        print("filter_cells: compute ", time.time() - t0)
        adata.obs['n_genes'] = row_lengths
        print("filter_cells: set metadata ", time.time() - t0)
        # Alternative ways of applying the row mask, selected by the
        # module-level `strategy`.  Strategies 8 and 11 only compute `rows`
        # and leave `adata` itself unsubsetted.
        if strategy == 1: adata = adata[rows_to_keep]
        if strategy == 2: adata._inplace_subset_obs(rows_to_keep)
        if strategy == 3: adata = adata[rows_to_keep].copy()
        if strategy == 4: adata = anndata.AnnData(adata.X[rows_to_keep], adata.obs.iloc[rows_to_keep, :], adata.var)
        # NOTE(review): drop(..., inplace=True) returns None in pandas, so
        # this strategy appears to pass None as the second argument -- confirm.
        if strategy == 5: adata = anndata.AnnData(
                adata.X[rows_to_keep],
                adata.obs.drop(adata.obs.iloc[np.logical_not(rows_to_keep), :], inplace=True),
                adata.var)
        if strategy == 6: adata._inplace_subset_obs(rows_to_keep)
        if strategy == 7: adata = anndata.AnnData(csr_subset(adata.X, rows_to_keep), adata.obs.iloc[rows_to_keep, :], adata.var)
        if strategy == 8: rows = csr_row_subset(adata.X, rows_to_keep)
        if strategy == 9: adata._inplace_subset_obs(rows_to_keep)
        if strategy == 10: adata = anndata.AnnData(adata.X[rows_to_keep], adata.obs.iloc[rows_to_keep, :], adata.var)
        if strategy == 11: rows = csr_row_subset2(adata.X, rows_to_keep)
        #if strategy==11: adata = anndata.AnnData(adata.X[rows_to_keep],adata.obs.iloc[rows_to_keep,:],adata.var)
    else:
        sc.pp.filter_cells(adata, min_genes=ming)
        print("filter_cells: first call ", time.time() - t0)
        sc.pp.filter_cells(adata, max_genes=maxg)
    print("filter_cells: filter total", time.time() - t0)
    # `rows` is only set by strategies 8 and 11; otherwise report the height
    # of the (possibly subsetted) adata.
    if rows == 0: rows = adata.shape[0]
    return adata, rows_to_keep, rows
def set_threads(num: int = -1) -> int:
    """Set the number of numba threads.

    Args:
        num (int, optional): The requested number of threads. Values that
            are not positive leave the setting alone. Defaults to -1.

    Returns:
        int: The previous number of threads when the setting was changed,
            or -1 when nothing was changed.
    """
    # Non-positive request: nothing to do.
    if not num > 0:
        return -1

    previous = get_num_threads()
    if previous == num:
        # Already at the requested value.
        return -1

    set_num_threads(num)
    return previous
def __init__(
    self,
    minibatch: int,
    maxT: int,
    maxU: int,
    alphabet_size: int,
    workspace,
    blank: int,
    fastemit_lambda: float,
    clamp: float,
    num_threads: int,
    stream,
):
    """
    Helper class to launch the CUDA Kernels to compute the Transducer Loss.

    Args:
        minibatch: Batch size.
        maxT: Maximum acoustic sequence length (T in the logprobs tensor).
        maxU: Maximum target sequence length (U in the logprobs tensor).
        alphabet_size: Vocabulary dimension V+1 (inclusive of RNNT blank).
        workspace: Pre-allocated chunk of memory, wrapped here as a CUDA
            array, that is sliced and reshaped into working buffers.
        blank: Index of the RNNT blank token in the vocabulary.
        fastemit_lambda: Scaling factor for FastEmit regularization.
        clamp: Gradient clamp value; its absolute value is stored.
        num_threads: Number of OMP threads to launch. A positive value is
            applied to numba (capped at the CPU count); otherwise numba's
            current thread count is recorded instead.
        stream: Numba Cuda Stream.
    """
    # Problem dimensions.
    self.minibatch_ = minibatch
    self.maxT_ = maxT
    self.maxU_ = maxU
    self.alphabet_size_ = alphabet_size

    # a flat vector of floatX numbers that represents allocated memory slices
    self.gpu_workspace = cuda.as_cuda_array(workspace)

    # Loss configuration.
    self.blank_ = blank
    self.fastemit_lambda_ = fastemit_lambda
    self.clamp_ = abs(clamp)
    self.num_threads_ = num_threads
    self.stream_ = stream  # type: cuda.cudadrv.driver.Stream

    if not num_threads > 0:
        # No explicit request: remember whatever numba is using now.
        self.num_threads_ = numba.get_num_threads()
    else:
        available_cores = multiprocessing.cpu_count()
        numba.set_num_threads(min(available_cores, num_threads))
def set_numba_threads(n):
    """Run the enclosed code with numba limited to ``n`` threads.

    This is a generator with a single ``yield``; it is presumably wrapped
    with ``contextlib.contextmanager`` (the decorator is not visible here).
    Requests above ``NUMBA_NUM_THREADS`` emit a warning and are clamped to
    that maximum.  The thread count active on entry is restored on exit,
    even if the managed code raises.
    """
    import numba
    from numba.core.config import NUMBA_NUM_THREADS

    saved_thread_count = numba.get_num_threads()
    try:
        if n > NUMBA_NUM_THREADS:
            warnings.warn(
                f"Attempting to set threads to {n}, which is larger than "
                f"NUMBA_NUM_THREADS={NUMBA_NUM_THREADS}. "
                f"Setting to allowed maximum NUMBA_NUM_THREADS instead.")
            # Clamp to the largest value numba will accept.
            n = NUMBA_NUM_THREADS
        numba.set_num_threads(n)
        yield
    finally:
        numba.set_num_threads(saved_thread_count)
def _transform(self, X, y=None):
    """Transform input time series using random convolutional kernels.

    Parameters
    ----------
    X : 3D np.ndarray of shape = [n_instances, n_dimensions, series_length]
        panel of time series to transform
    y : ignored argument for interface compatibility

    Returns
    -------
    pandas DataFrame, transformed features
    """
    X = X.astype(np.float64)
    X = convert(X, from_type="numpy3D", to_type="numpyflat", as_scitype="Panel")
    if self.normalise:
        # The small epsilon guards against division by zero for constant series.
        X = (X - X.mean(axis=-1, keepdims=True)) / (
            X.std(axis=-1, keepdims=True) + 1e-8)
    X1 = np.diff(X, 1)

    # An out-of-range n_jobs (< 1 or more than the available cores) means
    # "use every core".
    n_cores = multiprocessing.cpu_count()
    if self.n_jobs < 1 or self.n_jobs > n_cores:
        n_jobs = n_cores
    else:
        n_jobs = self.n_jobs

    prev_threads = get_num_threads()
    set_num_threads(n_jobs)
    try:
        X = _transform(
            X,
            X1,
            self.parameter,
            self.parameter1,
            self.n_features_per_kernel,
        )
    finally:
        # The thread count is process-wide state: restore it even when the
        # kernel raises, so a failed transform does not leak this setting.
        set_num_threads(prev_threads)

    X = np.nan_to_num(X)
    return pd.DataFrame(X)
def transform(self, X, y=None):
    """Transform input time series using random convolutional kernels.

    Parameters
    ----------
    X : pandas DataFrame, input time series (sktime format)
    y : array_like, target values (optional, ignored as irrelevant)

    Returns
    -------
    pandas DataFrame, transformed features
    """
    self.check_is_fitted()
    _X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    _X = _X[:, 0, :].astype(np.float64)
    _X = from_3d_numpy_to_2d_array(_X)
    if self.normalise:
        # The small epsilon guards against division by zero for constant series.
        _X = (_X - _X.mean(axis=-1, keepdims=True)) / (
            _X.std(axis=-1, keepdims=True) + 1e-8)
    X1 = np.diff(_X, 1)

    # An out-of-range n_jobs (< 1 or more than the available cores) means
    # "use every core".
    n_cores = multiprocessing.cpu_count()
    if self.n_jobs < 1 or self.n_jobs > n_cores:
        n_jobs = n_cores
    else:
        n_jobs = self.n_jobs

    prev_threads = get_num_threads()
    set_num_threads(n_jobs)
    try:
        _X = _transform(
            _X,
            X1,
            self.parameter,
            self.parameter1,
            self.n_features_per_kernel,
        )
    finally:
        # The thread count is process-wide state: restore it even when the
        # kernel raises, so a failed transform does not leak this setting.
        set_num_threads(prev_threads)

    _X = np.nan_to_num(_X)
    return pd.DataFrame(_X)
def __init__(
    self,
    minibatch: int,
    maxT: int,
    maxU: int,
    alphabet_size: int,
    workspace: torch.Tensor,
    blank: int,
    fastemit_lambda: float,
    num_threads: int,
    batch_first: bool,
):
    """
    Helper class to compute the Transducer Loss on CPU.

    Args:
        minibatch: Size of the minibatch b.
        maxT: Maximum acoustic sequence length (T in the logprobs tensor).
        maxU: Maximum target sequence length (U in the logprobs tensor).
        alphabet_size: Vocabulary dimension V+1 (inclusive of RNNT blank).
        workspace: Pre-allocated chunk of memory that is sliced and
            reshaped into working buffers.
        blank: Index of the RNNT blank token in the vocabulary.
        fastemit_lambda: Scaling factor for FastEmit regularization.
        num_threads: Number of OMP threads to launch. A positive value is
            applied to numba (capped at the CPU count); otherwise numba's
            current thread count is recorded instead.
        batch_first: Whether the batch dimension is first or third.
    """
    # Problem dimensions.
    self.minibatch_ = minibatch
    self.maxT_ = maxT
    self.maxU_ = maxU
    self.alphabet_size_ = alphabet_size

    # a flat vector of floatX numbers that represents allocated memory slices
    self.workspace = workspace

    # Loss configuration.
    self.blank_ = blank
    self.fastemit_lambda_ = fastemit_lambda
    self.num_threads_ = num_threads
    self.batch_first = batch_first

    if not num_threads > 0:
        # No explicit request: remember whatever numba is using now.
        self.num_threads_ = numba.get_num_threads()
    else:
        available_cores = multiprocessing.cpu_count()
        numba.set_num_threads(min(available_cores, num_threads))
def VelocityStructFunc_tree(pos, vel, weight, tree, rbins, max_bin_size_ratio=100, theta=0.7, boxsize=0, weighted_binning=False):
    """Return a weighted, per-radial-bin average accumulated over all points.

    For every point ``i`` a tree walk (``VelocityStructWalk``) yields two
    per-bin arrays.  The second is accumulated multiplied by ``weight[i]``
    as the numerator, the first multiplied by ``weight[i]`` as the
    denominator, and the per-bin ratio of the two totals is returned.

    Arguments:
    pos -- array of point positions, indexed along its first axis
    vel -- array of values passed to the walk alongside ``pos``
    weight -- per-point weights
    tree -- tree object handed to ``VelocityStructWalk``
    rbins -- 1D array of radial bin edges (``rbins.shape[0] - 1`` bins)

    Optional arguments:
    max_bin_size_ratio, theta, boxsize, weighted_binning -- forwarded
    unchanged to ``VelocityStructWalk``

    Returns:
    1D array with one value per radial bin
    """
    n_chunks = get_num_threads()
    n_bins = rbins.shape[0] - 1

    # One accumulator row per chunk so parallel iterations never share a row.
    weighted_total = zeros((n_chunks, n_bins))
    weight_total = zeros_like(weighted_total)

    # break into chunks for parallelization: chunk c handles points
    # c, c + n_chunks, c + 2 * n_chunks, ...
    for chunk in prange(n_chunks):
        for i in range(chunk, pos.shape[0], n_chunks):
            dwtsum, dmbin = VelocityStructWalk(
                pos[i],
                vel[i],
                tree,
                rbins,
                max_bin_size_ratio=max_bin_size_ratio,
                theta=theta,
                boxsize=boxsize,
                weighted_binning=weighted_binning)
            for j in range(n_bins):
                weighted_total[chunk, j] += dmbin[j] * weight[i]
                weight_total[chunk, j] += weight[i] * dwtsum[j]

    return weighted_total.sum(0) / weight_total.sum(0)
def apply_graph_updates_low_memory(current_graph, updates):
    """Push every ``(p, q, d)`` update into the heaps of both endpoints.

    ``current_graph`` is indexed as ``[0]`` indices, ``[1]`` priorities and
    ``[2]`` flags.  Every parallel iteration scans all updates but only
    touches the heaps of vertices whose index modulo the thread count equals
    its own iteration number, so no two iterations write to the same heap.
    Entries where ``p`` or ``q`` is ``-1`` are skipped.

    Returns the sum of the values returned by the heap pushes.
    """
    n_changes = 0
    indices = current_graph[0]
    priorities = current_graph[1]
    flags = current_graph[2]
    n_threads = numba.get_num_threads()

    for owner in numba.prange(n_threads):
        for block in range(len(updates)):
            for k in range(len(updates[block])):
                p, q, d = updates[block][k]

                if p == -1 or q == -1:
                    continue

                # Only the iteration that owns a vertex modifies its heap.
                if p % n_threads == owner:
                    n_changes += checked_flagged_heap_push(
                        priorities[p],
                        indices[p],
                        flags[p],
                        d,
                        q,
                        1,
                    )

                if q % n_threads == owner:
                    n_changes += checked_flagged_heap_push(
                        priorities[q],
                        indices[q],
                        flags[q],
                        d,
                        p,
                        1,
                    )

    return n_changes
def _transform(self, X, y=None):
    """Transform input time series using random convolutional kernels.

    Parameters
    ----------
    X : 3D np.ndarray of shape = [n_instances, n_dimensions, series_length]
        panel of time series to transform
    y : ignored argument for interface compatibility

    Returns
    -------
    pandas DataFrame, transformed features
    """
    if self.normalise:
        # The small epsilon guards against division by zero for constant series.
        X = (X - X.mean(axis=-1, keepdims=True)) / (
            X.std(axis=-1, keepdims=True) + 1e-8
        )
    _X1 = np.diff(X, 1)

    # An out-of-range n_jobs (< 1 or more than the available cores) means
    # "use every core".
    n_cores = multiprocessing.cpu_count()
    if self.n_jobs < 1 or self.n_jobs > n_cores:
        n_jobs = n_cores
    else:
        n_jobs = self.n_jobs

    prev_threads = get_num_threads()
    set_num_threads(n_jobs)
    try:
        X = _transform(
            X,
            _X1,
            self.parameter,
            self.parameter1,
            self.n_features_per_kernel,
        )
    finally:
        # The thread count is process-wide state: restore it even when the
        # kernel raises, so a failed transform does not leak this setting.
        set_num_threads(prev_threads)

    X = np.nan_to_num(X)
    return pd.DataFrame(X)
def sum(X, axis=None):
    """Sum the elements of ``X``, optionally along ``axis``.

    Sparse inputs, and all inputs when the module-level ``use_fastpp`` flag
    is off, are delegated to ``X.sum`` and wrapped in ``np.array``.
    Otherwise numba-parallel kernels are used for a full reduction
    (``axis is None``) and for 2-D inputs; for a 2-D input any axis other
    than ``0`` is handled by the per-row kernel.  Inputs that are not 2-D
    fall back to ``X.sum(axis=axis)`` when an axis is given.

    Parameters
    ----------
    X : array or sparse matrix to reduce.
    axis : ``None`` for a full reduction, otherwise the axis to sum over.

    Returns
    -------
    The reduced value or array.
    """
    # Cheap dispatch first: this path needs none of the kernels below.
    if issparse(X) or not use_fastpp:
        if axis is None:
            return np.array(X.sum())
        return np.array(X.sum(axis=axis))

    @numba.njit(cache=True, parallel=True)
    def _sum(X):
        s = 0
        for i in numba.pndindex(X.shape):
            s += X[i]
        return s

    @numba.njit(cache=True, parallel=True)
    def _sum0(X, nthr):
        # One partial-sum row per parallel iteration.  It must start at zero
        # (not np.empty) because rows are accumulated into, and an iteration
        # that receives no input rows must contribute nothing.
        s = np.zeros((nthr, X.shape[1]), dtype=X.dtype)
        for i in numba.prange(nthr):
            # Iteration i owns rows i, i + nthr, i + 2 * nthr, ...
            for r in range(i, X.shape[0], nthr):
                s[i] += X[r]
        return s.sum(axis=0)

    @numba.njit(cache=True, parallel=True)
    def _sum1(X):
        s = np.empty(X.shape[0], dtype=X.dtype)
        for r in numba.prange(X.shape[0]):
            s[r] = X[r].sum()
        return s

    if axis is None:
        return _sum(X)
    if X.ndim == 2:
        if axis == 0:
            nthr = numba.get_num_threads()
            return _sum0(X, nthr)
        return _sum1(X)
    return X.sum(axis=axis)
def __init__(self, *, options: Options, n_dims: (int, None) = None, non_unit_g_factor: bool = False, grid: (tuple, None) = None, n_threads: (int, None) = None):
    """Validate the configuration and build the step implementation.

    Exactly one of ``n_dims`` and ``grid`` must be given; the other is
    derived from it (a grid of ``-1`` entries, or the length of the grid).
    ``n_threads`` defaults to numba's current thread count and is forced to
    1 for one-dimensional problems, or when numba reports that parallel
    execution is unsupported.

    Raises
    ------
    ValueError
        If both or neither of ``n_dims`` and ``grid`` are supplied.
    NotImplementedError
        If ``options.DPDC`` is set for more than one dimension.
    """
    dims_given = n_dims is not None
    grid_given = grid is not None
    # Both supplied or both missing is an error.
    if dims_given == grid_given:
        raise ValueError()

    if grid_given:
        n_dims = len(grid)
    else:
        grid = (-1,) * n_dims

    if n_dims > 1 and options.DPDC:
        raise NotImplementedError()

    if n_threads is None:
        n_threads = numba.get_num_threads()

    self.__options = options
    self.__n_threads = n_threads if n_dims != 1 else 1

    if self.__n_threads > 1:
        try:
            numba.parfors.parfor.ensure_parallel_support()
        except numba.core.errors.UnsupportedParforsError:
            print(
                "Numba ensure_parallel_support() failed, forcing n_threads=1",
                file=sys.stderr)
            self.__n_threads = 1

    self.__n_dims = n_dims
    self.__call, self.traversals = make_step_impl(
        options, non_unit_g_factor, grid, self.n_threads)