def shfl_down_i32(mask, val, offset):
    """Build an expression for the ``cuda_shfl_down_sync_i32`` internal call.

    Args:
        mask: Forwarded as the first operand of the internal call.
        val: Forwarded as the second operand.
        offset: Forwarded as the third operand.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    # The trailing operand is 31 instead of 32 (the warp size): 32 was
    # observed not to work, whereas 31 yields the desired behavior.
    operands = expr.make_expr_group(mask, val, offset, 31)
    # NOTE(review): the positional False appears to be the runtime-context
    # flag of insert_internal_func_call -- confirm against the binding.
    call = _ti_core.insert_internal_func_call(
        "cuda_shfl_down_sync_i32", operands, False)
    return expr.Expr(call)
def shfl_up_f32(mask, val, offset):
    """Build an expression for the ``cuda_shfl_up_sync_f32`` internal call.

    Args:
        mask: Forwarded as the first operand of the internal call.
        val: Forwarded as the second operand.
        offset: Forwarded as the third operand.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    # Lane offset is 0 for warp size 32.
    operands = expr.make_expr_group(mask, val, offset, 0)
    # NOTE(review): the positional False appears to be the runtime-context
    # flag of insert_internal_func_call -- confirm against the binding.
    call = _ti_core.insert_internal_func_call(
        "cuda_shfl_up_sync_f32", operands, False)
    return expr.Expr(call)
def shfl_sync_i32(mask, val, offset):
    """Build an expression for the ``cuda_shfl_sync_i32`` internal call.

    Args:
        mask: Forwarded as the first operand of the internal call.
        val: Forwarded as the second operand.
        offset: Forwarded as the third operand.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    # Lane offset is 31 for warp size 32.
    operands = expr.make_expr_group(mask, val, offset, 31)
    # NOTE(review): the positional False appears to be the runtime-context
    # flag of insert_internal_func_call -- confirm against the binding.
    call = _ti_core.insert_internal_func_call(
        "cuda_shfl_sync_i32", operands, False)
    return expr.Expr(call)
def call_internal(name, *args):
    """Insert a call to the internal function ``name`` and return its result.

    Args:
        name: Name handed to ``_ti_core.insert_internal_func_call``.
        *args: Operands of the call. The collected tuple is passed to
            ``make_expr_group`` as a single argument (not unpacked).

    Returns:
        Whatever ``expr_init`` returns for the inserted call.
    """
    operands = make_expr_group(args)
    call = _ti_core.insert_internal_func_call(name, operands)
    return expr_init(call)
def shfl_up_f32(mask, val, offset):
    """Build an expression for the ``cuda_shfl_up_sync_f32`` internal call.

    Args:
        mask: Forwarded as the first operand of the internal call.
        val: Forwarded as the second operand.
        offset: Forwarded as the third operand.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    # The trailing operand is the clamp value of the shuffle.  For an "up"
    # shuffle over a full 32-lane warp it is 0 (CUDA's __shfl_up_sync uses
    # (32 - width) << 8, which is 0 for width 32).  The literal 32 used
    # previously sets a bit outside the documented clamp/segment-mask
    # fields, so it is replaced by 0.
    # NOTE(review): assumes this internal call lowers to the NVVM/PTX
    # shfl.sync.up instruction -- confirm against the runtime.
    operands = expr.make_expr_group(mask, val, offset, 0)
    # NOTE(review): the positional False appears to be the runtime-context
    # flag of insert_internal_func_call -- confirm against the binding.
    call = _ti_core.insert_internal_func_call(
        "cuda_shfl_up_sync_f32", operands, False)
    return expr.Expr(call)
def ballot(predicate):
    """Build an expression for the ``cuda_ballot_i32`` internal call.

    Args:
        predicate: The single operand forwarded to the internal call.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    operands = expr.make_expr_group(predicate)
    # NOTE(review): the positional False appears to be the runtime-context
    # flag of insert_internal_func_call -- confirm against the binding.
    call = _ti_core.insert_internal_func_call(
        "cuda_ballot_i32", operands, False)
    return expr.Expr(call)
def reduce_xor(value):
    """Build an expression for the ``subgroupXor`` internal call.

    Args:
        value: The single operand; its ``ptr.get_ret_type()`` also supplies
            the dtype of the returned expression.

    Returns:
        An ``expr.Expr`` wrapping the inserted call, typed like ``value``.
    """
    operands = expr.make_expr_group(value)
    call = _ti_core.insert_internal_func_call("subgroupXor", operands, False)
    # The result dtype is read after the call has been inserted, matching
    # the original evaluation order.
    result_dtype = value.ptr.get_ret_type()
    return expr.Expr(call, dtype=result_dtype)
def invocation_id():
    """Build an ``i32`` expression for the ``subgroupInvocationId`` internal call.

    Returns:
        An ``expr.Expr`` with dtype ``i32`` wrapping the inserted call.
    """
    no_operands = expr.make_expr_group()
    call = _ti_core.insert_internal_func_call(
        "subgroupInvocationId", no_operands, False)
    return expr.Expr(call, dtype=i32)
def group_size():
    """Build an ``i32`` expression for the ``subgroupSize`` internal call.

    Returns:
        An ``expr.Expr`` with dtype ``i32`` wrapping the inserted call.
    """
    no_operands = expr.make_expr_group()
    call = _ti_core.insert_internal_func_call(
        "subgroupSize", no_operands, False)
    return expr.Expr(call, dtype=i32)
def barrier():
    """Build an expression for the ``subgroupBarrier`` internal call.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    no_operands = expr.make_expr_group()
    call = _ti_core.insert_internal_func_call(
        "subgroupBarrier", no_operands, False)
    return expr.Expr(call)
def broadcast(value, index: i32):
    """Build an expression for the ``subgroupBroadcast`` internal call.

    Args:
        value: First operand forwarded to the internal call.
        index: Second operand forwarded to the internal call.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    operands = expr.make_expr_group(value, index)
    call = _ti_core.insert_internal_func_call(
        "subgroupBroadcast", operands, False)
    return expr.Expr(call)
def elect():
    """Build an expression for the ``subgroupElect`` internal call.

    Returns:
        An ``expr.Expr`` wrapping the inserted internal function call.
    """
    no_operands = expr.make_expr_group()
    call = _ti_core.insert_internal_func_call(
        "subgroupElect", no_operands, False)
    return expr.Expr(call)
def inclusive_or(value):
    """Build an expression for the ``subgroupInclusiveOr`` internal call.

    Args:
        value: The single operand; its ``ptr.get_ret_type()`` also supplies
            the dtype of the returned expression.

    Returns:
        An ``expr.Expr`` wrapping the inserted call, typed like ``value``.
    """
    operands = expr.make_expr_group(value)
    call = _ti_core.insert_internal_func_call(
        "subgroupInclusiveOr", operands, False)
    # The result dtype is read after the call has been inserted, matching
    # the original evaluation order.
    result_dtype = value.ptr.get_ret_type()
    return expr.Expr(call, dtype=result_dtype)
def call_internal(name, *args, with_runtime_context=True):
    """Insert a call to the internal function ``name`` and return its result.

    Args:
        name: Name handed to ``_ti_core.insert_internal_func_call``.
        *args: Operands of the call. The collected tuple is passed to
            ``make_expr_group`` as a single argument (not unpacked).
        with_runtime_context: Forwarded unchanged as the third positional
            argument of ``insert_internal_func_call``. Defaults to True.

    Returns:
        Whatever ``expr_init`` returns for the inserted call.
    """
    operands = make_expr_group(args)
    call = _ti_core.insert_internal_func_call(
        name, operands, with_runtime_context)
    return expr_init(call)