def gencode_probability(pfunc, name):
    fun, jac = pfunc.gen_expr()
    return Template(r'''
    using ${name}_theta_t = ${theta_t};

    struct ${name}_t : ${name}_theta_t {
        constexpr static int jac_dims = ${jac_dims};

        template<class N>
        __device__ __inline__ auto operator() (N const &n) const {
            return ${expr};
        }

        template<class N>
        __device__ __inline__ auto _j_a_c_o_b_i_a_n_(N const &n) const {
            graphdot::array<float, jac_dims> j;
            ${jac;\n};
            return j;
        }
    };

    __constant__ ${name}_t ${name};
    ''').render(
        name=name,
        jac_dims=len(jac),
        theta_t=decltype(pfunc),
        expr=fun,
        jac=[f'j[{i}] = {expr}' for i, expr in enumerate(jac)]
    )
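# Illustrative output sketch (hypothetical example, not from the source): for a
# probability functor named 'p_start' with a single float32 hyperparameter
# 'theta' whose gen_expr() returns ('theta', ['1.f']), the rendered CUDA source
# would look roughly like
#
#     using p_start_theta_t = struct{float32 theta;};
#     struct p_start_t : p_start_theta_t {
#         constexpr static int jac_dims = 1;
#         template<class N>
#         __device__ __inline__ auto operator() (N const &n) const {
#             return theta;
#         }
#         ...
#     };
#     __constant__ p_start_t p_start;
#
# Inheriting from the generated theta struct lets the device-side expression
# refer to hyperparameters by name.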
def gen_expr(self, x, y, theta_scope=''):
    F, J = self.kernel.gen_expr(
        '_1', '_2', theta_scope + 'kernel.'
    )
    f = Template(
        r'''normalize(
            [&](auto _1, auto _2){return ${f};},
            ${x},
            ${y}
        )'''
    ).render(
        x=x, y=y, f=F
    )
    template = Template(
        r'''normalize_jacobian(
            [&](auto _1, auto _2){return ${f};},
            [&](auto _1, auto _2){return ${j};},
            ${x},
            ${y}
        )'''
    )
    jacobian = [template.render(x=x, y=y, f=F, j=j) for j in J]
    return f, jacobian
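# Sketch of the rendered output (hypothetical inner kernel): if the wrapped
# kernel's gen_expr('_1', '_2', 'kernel.') yields 'gauss(_1, _2, kernel.s)',
# then gen_expr('x1', 'x2') returns an expression of the form
#
#     normalize([&](auto _1, auto _2){return gauss(_1, _2, kernel.s);}, x1, x2)
#
# plus one normalize_jacobian(...) expression per hyperparameter. The raw
# kernel is passed as a lambda, presumably so that the device-side helper can
# re-evaluate it on (x1, x1) and (x2, x2) as well as (x1, x2) when normalizing.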
def decltype(type, name=''):
    type = np.dtype(type, align=True)  # convert np.float32 etc. to dtype
    if type.names is not None:
        if len(type.names):
            return Template(r'struct{${members;};}${name}').render(
                name=name,
                members=[decltype(type.fields[v][0], v) for v in type.names])
        else:
            return 'constexpr static numpy_type::_empty {} {{}}'.format(name)
    # elif type.subdtype is not None:
    #     return Template(r'''${type} ${name}${dim}''').render(
    #         type=type.name, name=
    #     )
    else:
        return '{} {}'.format(str(type.name), name)
def decltype(t, name=''):
    '''Generate C++ source code for structures corresponding to a `cpptype`
    class.'''
    t = np.dtype(t, align=True)  # convert np.float32 etc. to dtype
    if name.startswith('$'):
        ''' template class types '''
        n, t, *Ts = name[1:].split('::')
        _assert_is_identifier(n)
        return Template(r'${template}<${arguments,}>${name}').render(
            template=t,
            arguments=[decltype(T) for T in Ts],
            name=n
        )
    else:
        _assert_is_identifier(name)
        if _dtype_util.is_object(t):
            ''' structs '''
            if len(t.names):
                return Template(r'struct{${members;};}${name}').render(
                    name=name,
                    members=[decltype(t.fields[v][0], v) for v in t.names])
            else:
                return f'constexpr static _empty {name} {{}}'
        elif _dtype_util.is_array(t):
            ''' C-style arrays '''
            return Template(r'${t} ${name}[${shape][}]').render(
                t=decltype(t.base), name=name, shape=t.shape
            )
        else:
            ''' scalar types and C-strings '''
            if t.kind == 'S':
                return f'char {name}[{t.itemsize}]'
            else:
                return f'{str(t.name)} {name}'.strip()
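# A small usage sketch (assumed behavior, inferred from the templates above):
#
#     node_t = np.dtype([('x', np.float32), ('i', np.int16)], align=True)
#     decltype(node_t, 'node')         # -> e.g. 'struct{float32 x;int16 i;}node'
#     decltype(np.float32, 'q')        # -> 'float32 q'
#     decltype(np.dtype('S8'), 'tag')  # -> 'char tag[8]'
#
# Structured dtypes become anonymous C structs, fixed-width byte strings become
# char arrays, and scalars map to their NumPy type names (which the
# accompanying C++ headers are expected to typedef).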
def gencode_kernel(kernel, name):
    fun, jac = kernel.gen_expr('x1', 'x2')
    return Template(r'''
    using ${name}_theta_t = ${theta_t};

    struct ${name}_t : ${name}_theta_t {
        constexpr static int jac_dims = ${jac_dims};

        template<class X>
        __device__ __inline__ auto operator() (X const &x1, X const &x2) const {
            return ${expr};
        }

        template<class X>
        __device__ __inline__ auto _j_a_c_o_b_i_a_n_(X const &x1, X const &x2) const {
            graphdot::array<float, jac_dims> j;
            ${jac;\n};
            return j;
        }
    };

    __constant__ ${name}_t ${name};
    __constant__ ${name}_t ${name}_diff_grid[2 * ${n_theta}];
    __constant__ float32 ${name}_flat_theta[${n_theta}];
    ''').render(
        name=name,
        jac_dims=len(jac),
        theta_t=decltype(kernel),
        expr=fun,
        jac=[f'j[{i}] = {expr}' for i, expr in enumerate(jac)],
        n_theta=len(list(flatten(kernel.theta)))
    )
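# Sketch of the extra boilerplate this renders, compared to
# gencode_probability (hypothetical kernel named 'node_kernel' with two scalar
# hyperparameters after flattening kernel.theta): besides the functor itself,
# constant memory is also reserved for finite-difference evaluation,
#
#     __constant__ node_kernel_t node_kernel;
#     __constant__ node_kernel_t node_kernel_diff_grid[2 * 2];
#     __constant__ float32 node_kernel_flat_theta[2];
#
# i.e. one perturbed copy of the functor per hyperparameter in each direction,
# plus a flat array of the hyperparameter values.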
def _from_sympy(name, desc, expr, vars, *hyperparameter_specs, minmax=(0, 1)):
    '''Create a microkernel class from a SymPy expression.

    Parameters
    ----------
    name: str
        The name of the microkernel. Must be a valid Python identifier.
    desc: str
        A human-readable description of the microkernel. Will be used to build
        the docstring of the returned microkernel class.
    expr: str or SymPy expression
        Expression of the microkernel in SymPy format.
    vars: 2-tuple of str or SymPy symbols
        The input variables of the microkernel as they appear in the
        expression. A microkernel must have exactly 2 input variables. All
        other symbols that show up in its expression should be regarded as
        hyperparameters.
    hyperparameter_specs: list of hyperparameter specifications in one of
    the formats below:

        | symbol,
        | (symbol,),
        | (symbol, dtype),
        | (symbol, dtype, description),
        | (symbol, dtype, lower_bound, upper_bound),
        | (symbol, dtype, lower_bound, upper_bound, description),

        If a default set of lower and upper bounds is not defined here, then
        it must be specified explicitly during microkernel object creation,
        using arguments as specified in the microkernel class's docstring.
    minmax: a 2-tuple of floats
        The minimum and maximum value that the kernel can output.
    '''
    assert(isinstance(name, str) and name.isidentifier())

    '''parse expression'''
    if isinstance(expr, str):
        expr = sy.sympify(expr)

    '''check input variables'''
    if len(vars) != 2:
        raise ValueError('A microkernel must have exactly two variables')
    vars = [sy.Symbol(v) if isinstance(v, str) else v for v in vars]

    '''parse the list of hyperparameters'''
    hyperdefs = OrderedDict()
    for spec in hyperparameter_specs:
        if not hasattr(spec, '__iter__'):
            symbol = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(np.float32))
        elif len(spec) == 1:
            symbol = spec[0]
            hyperdefs[symbol] = dict(dtype=np.dtype(np.float32))
        elif len(spec) == 2:
            symbol, dtype = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype))
        elif len(spec) == 3:
            symbol, dtype, doc = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype), doc=doc)
        elif len(spec) == 4:
            symbol, dtype, lb, ub = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype), bounds=(lb, ub))
        elif len(spec) == 5:
            symbol, dtype, lb, ub, doc = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype), bounds=(lb, ub),
                                     doc=doc)
        else:
            raise ValueError(
                'Invalid hyperparameter specification, must be one of\n'
                '(symbol)\n'
                '(symbol, dtype)\n'
                '(symbol, dtype, doc)\n'
                '(symbol, dtype, lb, ub)\n'
                '(symbol, dtype, lb, ub, doc)\n'
            )

    '''create microkernel class'''
    class CppType(type(MicroKernel)):
        @property
        def dtype(cls):
            return cls._dtype

    class uKernel(MicroKernel, metaclass=CppType):
        _expr = expr
        _vars = vars
        _hyperdefs = hyperdefs
        _dtype = np.dtype([(k, v['dtype']) for k, v in hyperdefs.items()],
                          align=True)

        @property
        def name(self):
            return name

        def __init__(self, *args, **kwargs):
            self._theta_values = values = OrderedDict()
            self._theta_bounds = bounds = OrderedDict()

            for symbol, value in zip(self._hyperdefs, args):
                values[symbol] = value

            for symbol in self._hyperdefs:
                try:
                    values[symbol] = kwargs[symbol]
                except KeyError:
                    if symbol not in values:
                        raise KeyError(
                            f'Hyperparameter {symbol} not provided '
                            f'for {self.name}'
                        )

                try:
                    bounds[symbol] = kwargs['%s_bounds' % symbol]
                except KeyError:
                    try:
                        bounds[symbol] = self._hyperdefs[symbol]['bounds']
                    except KeyError:
                        raise KeyError(
                            f'Bounds for hyperparameter {symbol} of '
                            f'microkernel {self.name} not set, and '
                            f'no defaults were given.'
                        )

                self._assert_bounds(symbol, bounds[symbol])

        # @cached_property
        @property
        def _vars_and_hypers(self):
            if not hasattr(self, '_vars_and_hypers_cached'):
                self._vars_and_hypers_cached = [
                    *self._vars, *self._hyperdefs.keys()
                ]
            return self._vars_and_hypers_cached

        # @cached_property
        @property
        def _fun(self):
            if not hasattr(self, '_fun_cached'):
                self._fun_cached = lambdify(
                    self._vars_and_hypers, self._expr
                )
            return self._fun_cached
            # return lambdify(self._vars_and_hypers, self._expr)

        # @cached_property
        @property
        def _jac(self):
            if not hasattr(self, '_jac_cached'):
                self._jac_cached = [
                    lambdify(self._vars_and_hypers, sy.diff(expr, h))
                    for h in self._hyperdefs
                ]
            return self._jac_cached
            # return [lambdify(self._vars_and_hypers, sy.diff(expr, h))
            #         for h in self._hyperdefs]

        def __call__(self, x1, x2, jac=False):
            if jac is True:
                return (
                    self._fun(x1, x2, *self.theta),
                    np.array([j(x1, x2, *self.theta) for j in self._jac])
                )
            else:
                return self._fun(x1, x2, *self.theta)

        def __repr__(self):
            return Template('${cls}(${theta, }, ${bounds, })').render(
                cls=self.name,
                theta=[f'{n}={v}' for n, v in self._theta_values.items()],
                bounds=[f'{n}_bounds={v}'
                        for n, v in self._theta_bounds.items()]
            )

        def gen_expr(self, x, y, theta_scope=''):
            nmap = {
                str(self._vars[0]): x,
                str(self._vars[1]): y,
                **{t: theta_scope + t for t in self._hyperdefs}
            }
            return (
                cudacxxcode(self._expr, nmap),
                [cudacxxcode(sy.diff(self._expr, h), nmap)
                 for h in self._hyperdefs]
            )

        @property
        def dtype(self):
            return self._dtype

        @property
        def state(self):
            return tuple(self._theta_values.values())

        @property
        def theta(self):
            return pretty_tuple(
                self.name,
                self._theta_values.keys()
            )(**self._theta_values)

        @theta.setter
        def theta(self, seq):
            assert(len(seq) == len(self._theta_values))
            for theta, value in zip(self._hyperdefs, seq):
                self._theta_values[theta] = value

        @property
        def bounds(self):
            return tuple(self._theta_bounds.values())

        @property
        def minmax(self):
            return minmax

    '''furnish doc strings'''
    param_docs = [
        Template(
            '${name}: ${type}\n'
            '    ${desc\n    }\n'
            '${name}_bounds: tuple or "fixed"\n'
            '    Lower and upper bounds of `${name}` with respect to '
            'hyperparameter optimization. If "fixed", the hyperparameter will '
            'not be optimized during training.'
        ).render(
            name=name,
            type=hdef['dtype'],
            desc=[s.strip() for s in hdef.get('doc', '').split('\n')]
        ) for name, hdef in hyperdefs.items()
    ]

    uKernel.__doc__ = Template(
        '${desc}\n'
        '\n'
        'Parameters\n'
        '----------\n'
        '${param_docs\n}',
        escape=False
    ).render(
        desc='\n'.join([s.strip() for s in desc.split('\n')]),
        param_docs=param_docs
    )

    return uKernel
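# Illustrative usage sketch (assumes `_from_sympy` is exposed as
# `MicroKernel.from_sympy`; the kernel name, expression, and hyperparameter
# below are hypothetical):
#
#     SquareExponential = MicroKernel.from_sympy(
#         'SquareExponential',
#         'A Gaussian bump over the difference of two scalar features.',
#         'exp(-(x - y)**2 / (2 * length_scale**2))',
#         ('x', 'y'),
#         ('length_scale', np.float32, 1e-6, np.inf)
#     )
#     k = SquareExponential(length_scale=1.0)
#     k(0.0, 1.0)             # kernel value
#     k(0.0, 1.0, jac=True)   # value plus gradient w.r.t. length_scale
#
# The 4-element hyperparameter spec supplies default bounds, so no
# `length_scale_bounds` argument is required at construction time.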
def test_render_file(case):
    key1, key2, val, result = case
    assert (Template(os.path.join(_pwd, 'test_template.tpl')).render(
        key1=key1, key2=key2, val=val) == result)
def test_render_list(case):
    separators, repls = case
    for sep in separators:
        assert (Template(r'${key%s}' % sep).render(key=repls)
                == sep.join(repls))
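# A concrete illustration of the separator syntax exercised above (the example
# values are hypothetical, not taken from the real test fixtures): whatever
# text follows the key inside ${...} is used verbatim to join list elements.
def test_render_list_example():
    assert Template('${key, }').render(key=['a', 'b', 'c']) == 'a, b, c'
    assert Template('${key*}').render(key=['u', 'v']) == 'u*v'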
def test_render_single(case):
    tpl, repl, result = case
    assert (Template(tpl).render(_1=repl) == result)
def template(self):
    return Template(os.path.join(os.path.dirname(__file__), '_backend.cu'))
def gencode(self, x, y):
    return Template('(${expr*})').render(expr=[
        k.gencode('%s.%s' % (x, key), '%s.%s' % (y, key))
        for key, k in self.kw_kernels.items()
    ])
def __repr__(self):
    return Template('TensorProduct(${kwexpr, })').render(kwexpr=[
        '{}={}'.format(kw, repr(k)) for kw, k in self.kw_kernels.items()
    ])
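# Sketch of the generated expression (hypothetical sub-kernels whose gencode
# renders as 'kdelta(...)' and 'sqexp(...)'): a TensorProduct over the node
# attributes 'element' and 'charge' would render gencode('v1', 'v2') roughly as
#
#     (kdelta(v1.element, v2.element)*sqexp(v1.charge, v2.charge))
#
# since '${expr*}' joins the per-attribute sub-expressions with '*', i.e. the
# tensor product is a plain multiplication of the attribute-wise similarities.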
class MarginalizedGraphKernel:
    """Implements the random walk-based graph similarity kernel as proposed
    in: Kashima, H., Tsuda, K., & Inokuchi, A. (2003). Marginalized kernels
    between labeled graphs. *In Proceedings of the 20th international
    conference on machine learning (ICML-03)* (pp. 321-328).

    Parameters
    ----------
    node_kernel: base kernel or composition of base kernels
        A kernel that computes the similarity between individual nodes.
    edge_kernel: base kernel or composition of base kernels
        A kernel that computes the similarity between individual edges.
    kwargs: optional arguments
        p: functor or 'uniform' or 'default'
            The starting probability of the random walk on each node. Must be
            either a functor that takes in a node (a dataframe row) and
            returns a number, or the name of a built-in distribution.
            Currently, only 'uniform' and 'default' are implemented. Note that
            a custom probability does not have to be normalized.
        q: float in (0, 1)
            The probability for the random walk to stop during each step.
        block_per_sm: int
            Tunes the GPU kernel.
        block_size: int
            Tunes the GPU kernel.
    """

    _template = os.path.join(os.path.dirname(__file__), 'template.cu')

    def __init__(self, node_kernel, edge_kernel, **kwargs):
        self.node_kernel = node_kernel
        self.edge_kernel = edge_kernel
        self.template = Template(self._template)
        self.scratch = None
        self.scratch_capacity = 0
        self.graph_cache = {}
        self.p = kwargs.pop('p', 'default')
        self.q = kwargs.pop('q', 0.01)
        self.block_per_sm = kwargs.pop('block_per_sm', 8)
        self.block_size = kwargs.pop('block_size', 128)
        self.device = pycuda.driver.Device(kwargs.pop('device', 0))
        self.nvcc_extra = kwargs.pop('nvcc_extra', [])
        self.ctx = self.device.make_context()

    def __del__(self):
        self.ctx.synchronize()
        self.ctx.detach()

    def _allocate_scratch(self, count, capacity):
        if (self.scratch is None or len(self.scratch) < count
                or self.scratch[0].capacity < capacity):
            self.ctx.synchronize()
            self.scratch = [BlockScratch(capacity) for _ in range(count)]
            self.scratch_d = umlike(
                np.array([s.state for s in self.scratch], BlockScratch.dtype))
            self.scratch_capacity = self.scratch[0].capacity
            self.ctx.synchronize()

    def clone_with_theta(self):
        """scikit-learn compatibility method"""
        pass

    def _assert_homegeneous(self, X):
        for x1, x2 in zip(X[:-1], X[1:]):
            try:
                assert (x1.weighted == x2.weighted)
                assert (x1.node_type == x2.node_type)
                assert (x1.edge_type == x2.edge_type)
            except AssertionError as e:
                raise TypeError('All graphs must be of the same type: %s'
                                % str(e))

    def _convert_to_octilegraph(self, graph):
        if hasattr(graph, 'uuid') and graph.uuid in self.graph_cache:
            return self.graph_cache[graph.uuid]
        else:
            if not hasattr(graph, 'uuid'):
                graph.uuid = uuid.uuid4()
            og = OctileGraph(graph)
            self.graph_cache[graph.uuid] = og
            return og

    def _get_starting_probability(self, p):
        if isinstance(self.p, str):
            if self.p == 'uniform' or self.p == 'default':
                return lambda n: 1.0
            else:
                raise ValueError(
                    'Unknown starting probability distribution %s' % self.p)
        else:
            return p

    def _launch_kernel(self, graphs, jobs, nodal, lmin):
        if lmin != 0 and lmin != 1:
            raise ValueError('lmin must be 0 or 1')

        ''' transfer graphs to GPU '''
        oct_graphs = [self._convert_to_octilegraph(g) for g in graphs]
        self._assert_homegeneous(oct_graphs)
        oct_graphs_d = umlike(
            np.array([g.state for g in oct_graphs], OctileGraph.dtype))

        ''' upload work item list to device '''
        jobs_d = umlike(np.array([j.state for j in jobs], Job.dtype))
        i_job_global = pycuda.gpuarray.zeros(1, np.uint32)

        ''' prepare GPU kernel launch '''
        x = next(iter(oct_graphs))
        weighted = x.weighted
        node_type = x.node_type
        edge_type = x.edge_type

        if weighted:
            edge_kernel = TensorProduct(weight=_Multiply(),
                                        label=self.edge_kernel)
        else:
            edge_kernel = self.edge_kernel

        node_kernel_src = Template(r'''
        struct node_kernel {
            template<class V>
            __device__ static auto compute(V const &v1, V const &v2) {
                return ${node_expr};
            }
        };
        ''').render(node_expr=self.node_kernel.gencode('v1', 'v2'))

        edge_kernel_src = Template(r'''
        struct edge_kernel {
            template<class T>
            __device__ static auto compute(T const &e1, T const &e2) {
                return ${edge_expr};
            }
        };
        ''').render(edge_expr=edge_kernel.gencode('e1', 'e2'))

        source = self.template.render(node_kernel=node_kernel_src,
                                      edge_kernel=edge_kernel_src,
                                      node_t=decltype(node_type),
                                      edge_t=decltype(edge_type))
        self.source = source

        with warnings.catch_warnings(record=True) as w:
            mod = SourceModule(
                source,
                options=['-std=c++14',
                         '-O4',
                         '--use_fast_math',
                         '--expt-relaxed-constexpr',
                         '--maxrregcount=64',
                         '-Xptxas', '-v',
                         '-lineinfo'] + self.nvcc_extra,
                no_extern_c=True,
                include_dirs=cpp.__path__)
            self.compiler_message = [str(rec.message) for rec in w]

        kernel = mod.get_function('graph_kernel_solver')

        launch_block_count = (self.device.MULTIPROCESSOR_COUNT
                              * self.block_per_sm)
        shmem_bytes_per_warp = mod.get_global('shmem_bytes_per_warp')[1]
        shmem_bytes_per_block = (shmem_bytes_per_warp * self.block_size
                                 // self.device.WARP_SIZE)

        max_graph_size = np.max([g.padded_size for g in oct_graphs])
        self._allocate_scratch(launch_block_count, max_graph_size**2)

        # print("%-32s : %ld" % ("Blocks launched", launch_block_count))
        # print("%-32s : %ld" % ("Shared memory per block",
        #                        shmem_bytes_per_block))

        kernel(oct_graphs_d.base,
               self.scratch_d.base,
               jobs_d.base,
               i_job_global,
               np.uint32(len(jobs)),
               np.float32(self.q),
               np.float32(self.q),  # placeholder for q0
               np.int32(lmin),
               grid=(launch_block_count, 1, 1),
               block=(self.block_size, 1, 1),
               shared=shmem_bytes_per_block)

    def __call__(self, X, Y=None, nodal=False, lmin=0):
        """Compute the pairwise similarity matrix between graphs.

        Parameters
        ----------
        X: list of N graphs
            The graphs must all have the same node and edge attributes.
        Y: None or list of M graphs
            The graphs must all have the same node and edge attributes.
        nodal: bool
            If True, return node-wise similarities; otherwise, return
            graph-wise similarities.
        lmin: 0 or 1
            Number of steps to skip in each random walk path before the
            similarity is computed. lmin + 1 corresponds to the starting value
            of l in the summation of Eq. 1 in Tang & de Jong, 2019
            https://doi.org/10.1063/1.5078640 (or the first unnumbered
            equation in Kashima, Tsuda, and Inokuchi, 2003).

        Returns
        -------
        numpy.array
            If Y is None, returns a square matrix containing pairwise
            similarities between the graphs in X; otherwise, returns a matrix
            containing similarities across the graphs in X and Y.
""" ''' generate jobs ''' if Y is None: jobs = [ Job(i, i + j, umarray(len(g1.nodes) * len(g2.nodes))) for i, g1 in enumerate(X) for j, g2 in enumerate(X[i:]) ] else: jobs = [ Job(i, len(X) + j, umarray(len(g1.nodes) * len(g2.nodes))) for i, g1 in enumerate(X) for j, g2 in enumerate(Y) ] ''' assign starting probabilities ''' p_func = self._get_starting_probability(self.p) P = [np.array([p_func(n) for n in g.nodes.iterrows()]) for g in X] if Y is not None: P += [np.array([p_func(n) for n in g.nodes.iterrows()]) for g in Y] ''' call GPU kernel ''' self._launch_kernel(X + Y if Y is not None else X, jobs, nodal, lmin) ''' collect result ''' if Y is None: N = len(X) R = np.empty((N, N), np.object) for job in jobs: r = job.vr.reshape(len(X[job.i].nodes), -1) pi = P[job.i] pj = P[job.j] if nodal is True: R[job.i, job.j] = pi[:, None] * r * pj[None, :] R[job.j, job.i] = R[job.i, job.j].T else: R[job.i, job.j] = R[job.j, job.i] = pi.dot(r).dot(pj) else: N = len(X) M = len(Y) R = np.empty((N, M), np.object) for job in jobs: r = job.vr.reshape(len(X[job.i].nodes), -1) pi = P[job.i] pj = P[job.j] if nodal is True: R[job.i, job.j - N] = pi[:, None] * r * pj[None, :] else: R[job.i, job.j - N] = pi.dot(r).dot(pj) return np.block(R.tolist()) def diag(self, X, nodal=False, lmin=0): """Compute the self-similarities for a list of graphs Parameters ---------- X: list of N graphs The graphs must all have same node attributes and edge attributes. nodal: bool or 'block' If True, returns a vector containing nodal self similarties; if False, returns a vector containing graphs' overall self similarities; if 'block', return a list of square matrices, each being a pairwise nodal similarity matrix within a graph. lmin: 0 or 1 Number of steps to skip in each random walk path before similarity is computed. lmin + 1 corresponds to the starting value of l in the summation of Eq. 1 in Tang & de Jong, 2019 https://doi.org/10.1063/1.5078640 (or the first unnumbered equation as in Kashima, Tsuda, and Inokuchi, 2003). Returns ------- numpy.array or list of np.array(s) If nodal=True, returns a vector containing nodal self similarties; if nodal=False, returns a vector containing graphs' overall self similarities; if nodal = 'block', return a list of square matrices, each being a pairwise nodal similarity matrix within a graph. """ ''' generate jobs ''' jobs = [Job(i, i, umarray(len(g1.nodes)**2)) for i, g1 in enumerate(X)] ''' assign starting probabilities ''' p_func = self._get_starting_probability(self.p) P = [np.array([p_func(n) for n in g.nodes.iterrows()]) for g in X] ''' call GPU kernel ''' self._launch_kernel(X, jobs, nodal, lmin) ''' collect result ''' N = [len(x.nodes) for x in X] if nodal is True: return np.concatenate( [p**2 * job.vr[::n + 1] for job, p, n in zip(jobs, P, N)]) elif nodal is False: return np.array([ p.dot(job.vr.reshape(n, -1)).dot(p) for job, p, n in zip(jobs, P, N) ]) elif nodal == 'block': return list(p[:, None] * job.vr.reshape(n, -1) * p[None, :] for job, p, n in zip(jobs, P, N)) else: raise (ValueError("Invalid 'nodal' option '%s'" % nodal))