def generic(self, args, kws):
    """Type a two-operand operation on string / dt64 series as a boolean series.

    A ``SeriesType(types.boolean)`` signature is produced when:

    * at least one operand is a string series (each operand must then be a
      string array or a plain string, otherwise an assertion fails), or
    * one operand is a dt64 series and the other one is a plain string.

    Any other operand combination yields ``None``.
    """
    from hpat.str_arr_ext import is_str_arr_typ

    assert not kws
    va, vb = args

    # if one of the inputs is string array
    if is_str_series_typ(va) or is_str_series_typ(vb):
        # inputs should be either string array or string
        for operand in (va, vb):
            assert is_str_arr_typ(operand) or operand == string_type
        return signature(SeriesType(types.boolean), va, vb)

    # dt64 series paired with a string, in either operand order
    for series_typ, other_typ in ((va, vb), (vb, va)):
        if is_dt64_series_typ(series_typ) and other_typ == string_type:
            return signature(SeriesType(types.boolean), va, vb)

    return None
def concat_overload(arr_list):
    """Select the concatenation implementation for a tuple type of arrays.

    A ``types.UniTuple`` whose element type is a string array gets a
    two-pass implementation (size the output, then copy every input into
    it).  Otherwise every element type must be a ``types.Array`` and the
    work is delegated to ``np.concatenate``.

    Raises:
        ValueError: if a non-string input contains a non-``types.Array``
            element type.
    """
    # all string input case
    # TODO: handle numerics to string casting case
    all_string_arrays = (isinstance(arr_list, types.UniTuple)
                         and is_str_arr_typ(arr_list.dtype))
    if all_string_arrays:
        def string_concat_impl(in_arrs):
            # first pass: count strings and characters to preallocate
            total_strs = 0
            total_chars = 0
            for item in in_arrs:
                unboxed = dummy_unbox_series(item)
                total_strs += len(unboxed)
                total_chars += hpat.str_arr_ext.num_total_chars(unboxed)
            result = hpat.str_arr_ext.pre_alloc_string_array(
                total_strs, total_chars)

            # second pass: copy each input at the running string/char offsets
            str_pos = 0
            char_pos = 0
            for item in in_arrs:
                unboxed = dummy_unbox_series(item)
                hpat.str_arr_ext.set_string_array_range(
                    result, unboxed, str_pos, char_pos)
                str_pos += len(unboxed)
                char_pos += hpat.str_arr_ext.num_total_chars(unboxed)
            return result

        return string_concat_impl

    if not all(isinstance(typ, types.Array) for typ in arr_list):
        raise ValueError("concat supports only numerical and string arrays")

    # numerical input
    return lambda a: np.concatenate(dummy_unbox_series(a))
def _handle_string_array_expr(self, lhs, rhs, assign):
    """Rewrite a comparison involving a string array into an explicit loop.

    When ``rhs`` is a binop using one of ``==, !=, >=, >, <=, <`` and at
    least one operand is typed as a string array, a small function is
    generated that allocates a boolean output array and fills it element
    by element inside ``numba.parfor.internal_prange``.  The function is
    compiled to Numba IR, its arguments are replaced by the original
    operands, and ``assign.value`` is redirected to the output array.

    Args:
        lhs: target of the assignment (not used here).
        rhs: the expression being assigned.
        assign: the assignment node; its ``value`` is rewritten in place.

    Returns:
        ``(f_blocks, [assign])`` when the rewrite applies, else ``None``.
    """
    # convert str_arr==str into parfor
    is_str_arr_compare = (
        rhs.op == 'binop'
        and rhs.fn in ('==', '!=', '>=', '>', '<=', '<')
        and (is_str_arr_typ(self.typemap[rhs.lhs.name])
             or is_str_arr_typ(self.typemap[rhs.rhs.name])))
    if not is_str_arr_compare:
        return None

    arg1 = rhs.lhs
    arg2 = rhs.rhs
    # scalar operands are used as-is; array operands are indexed per element
    arg1_access = 'A'
    arg2_access = 'B'
    len_call = 'len(A)'
    if is_str_arr_typ(self.typemap[arg1.name]):
        arg1_access = 'A[i]'
        # replace type now for correct typing of len, etc.
        self.typemap.pop(arg1.name)
        self.typemap[arg1.name] = string_array_type
    if is_str_arr_typ(self.typemap[arg2.name]):
        # Index the *second* operand.  Assigning to arg1_access here would
        # generate "B[i] <op> B" (or clobber "A[i]" when both are arrays).
        arg2_access = 'B[i]'
        len_call = 'len(B)'
        self.typemap.pop(arg2.name)
        self.typemap[arg2.name] = string_array_type

    # The loop body must be indented deeper than the ``for`` header,
    # otherwise exec() cannot compile the generated source.
    func_text = 'def f(A, B):\n'
    func_text += '    l = {}\n'.format(len_call)
    func_text += '    S = np.empty(l, dtype=np.bool_)\n'
    func_text += '    for i in numba.parfor.internal_prange(l):\n'
    func_text += '        S[i] = {} {} {}\n'.format(
        arg1_access, rhs.fn, arg2_access)

    loc_vars = {}
    exec(func_text, {}, loc_vars)
    f = loc_vars['f']
    f_blocks = compile_to_numba_ir(
        f, {'numba': numba, 'np': np}, self.typingctx,
        (if_series_to_array_type(self.typemap[arg1.name]),
         if_series_to_array_type(self.typemap[arg2.name])),
        self.typemap, self.calltypes).blocks
    first_block = f_blocks[min(f_blocks.keys())]
    replace_arg_nodes(first_block, [arg1, arg2])

    # replace == expression with result of parfor (S)
    # S is the target of the second-to-last statement in the 1st block of f
    assign.value = first_block.body[-2].target
    return (f_blocks, [assign])
def generic(self, args, kws):
    """Type a single-argument concatenation call.

    The only positional argument is the tuple type of inputs.  A
    ``types.UniTuple`` of string arrays produces ``string_array_type``;
    anything else is converted with ``if_series_to_array_type`` and typed
    by Numba's ``NdConcatenate`` template.
    """
    assert not kws
    assert len(args) == 1
    arr_list = args[0]

    uniform_strings = (isinstance(arr_list, types.UniTuple)
                       and is_str_arr_typ(arr_list.dtype))
    if uniform_strings:
        return signature(string_array_type, arr_list)

    # use typer of np.concatenate
    plain_arrays = if_series_to_array_type(arr_list)
    concat_typer = numba.typing.npydecl.NdConcatenate(self.context).generic()
    return signature(concat_typer(plain_arrays), arr_list)
def init_set_string_array(in_typ):
    """Return an implementation that builds a string set from a string array.

    Yields ``None`` when ``in_typ`` is not a string array type.
    """
    if not is_str_arr_typ(in_typ):
        return None

    def f(A):
        # unbox, then add every element to a freshly initialised string set
        values = dummy_unbox_series(A)
        result = init_set_string()
        for idx in range(len(values)):
            item = values[idx]
            result.add(item)
            # the extracted string is released right after it is added
            hpat.str_ext.del_str(item)
        return result

    return f
def populate_str_arr_from_set(typingctx, in_set_typ, in_str_arr_typ=None):
    """Typing plus codegen for the external ``populate_str_arr_from_set`` call.

    Asserts that the first argument is ``set_string_type`` and the second a
    string array type.  The generated code passes the set value together
    with the string array's ``offsets`` and ``data`` members to the
    external function and returns a dummy value.

    Returns:
        A ``(signature, codegen)`` pair whose signature is
        ``void(set_string_type, string_array_type)``.
    """
    assert in_set_typ == set_string_type
    assert is_str_arr_typ(in_str_arr_typ)

    def codegen(context, builder, sig, args):
        set_val, str_arr_val = args
        arr_struct = context.make_helper(
            builder, string_array_type, str_arr_val)

        # external signature: void(i8*, i32*, i8*)
        i8_ptr = lir.IntType(8).as_pointer()
        i32_ptr = lir.IntType(32).as_pointer()
        fn_type = lir.FunctionType(lir.VoidType(), [i8_ptr, i32_ptr, i8_ptr])
        fn_populate = builder.module.get_or_insert_function(
            fn_type, name="populate_str_arr_from_set")

        call_args = [set_val, arr_struct.offsets, arr_struct.data]
        builder.call(fn_populate, call_args)
        return context.get_dummy_value()

    return types.void(set_string_type, string_array_type), codegen
def build_set(A):
    """Choose the set-building implementation for the type ``A``.

    String array types use ``_build_str_set_impl``; every other type falls
    back to the builtin ``set`` constructor.
    """
    if not is_str_arr_typ(A):
        return lambda A: set(A)
    return _build_str_set_impl
def init_set_string_array(A):
    """Return ``_build_str_set_impl`` for string array types, else ``None``."""
    return _build_str_set_impl if is_str_arr_typ(A) else None
def nunique_overload_parallel(arr_typ):
    """Return the distributed ``nunique`` implementation for ``arr_typ``.

    Two implementations are provided:

    * string arrays (``is_str_arr_typ(arr_typ)``): ``nunique_par_str``
    * ``types.Array(types.int64, 1, 'C')``: ``nunique_par``

    Any other type fails the assertion below.  Both implementations
    de-duplicate locally, exchange the unique values between processes,
    de-duplicate what was received and combine the per-process counts with
    ``dist_reduce`` using the ``Sum`` reduction.
    """
    # TODO: extend to other types
    sum_op = hpat.distributed_api.Reduce_Type.Sum.value
    if is_str_arr_typ(arr_typ):
        # type enums passed to c_alltoallv for the length and char buffers
        int32_typ_enum = np.int32(_h5_typ_table[types.int32])
        char_typ_enum = np.int32(_h5_typ_table[types.uint8])

        def nunique_par_str(A):
            # local de-duplication first, so only unique strings are exchanged
            uniq_A = hpat.utils.to_array(set(A))
            n_strs = len(uniq_A)
            n_pes = hpat.distributed_api.get_size()
            # send recv counts for the number of strings
            send_counts, recv_counts = hpat.hiframes_join.send_recv_counts_new(
                uniq_A)
            send_disp = hpat.hiframes_join.calc_disp(send_counts)
            recv_disp = hpat.hiframes_join.calc_disp(recv_counts)
            recv_size = recv_counts.sum()
            # send recv counts for the number of chars
            send_chars_count, recv_chars_count = set_recv_counts_chars(uniq_A)
            send_disp_chars = hpat.hiframes_join.calc_disp(send_chars_count)
            recv_disp_chars = hpat.hiframes_join.calc_disp(recv_chars_count)
            recv_num_chars = recv_chars_count.sum()
            n_all_chars = hpat.str_arr_ext.num_total_chars(uniq_A)
            # allocate send recv arrays
            send_arr_lens = np.empty(n_strs, np.uint32)  # XXX offset type is uint32
            send_arr_chars = np.empty(n_all_chars, np.uint8)
            recv_arr = hpat.str_arr_ext.pre_alloc_string_array(
                recv_size, recv_num_chars)
            # populate send array
            # per-destination write cursors, relative to that destination's
            # displacement in the send buffers
            tmp_offset = np.zeros(n_pes, dtype=np.int64)
            tmp_offset_chars = np.zeros(n_pes, dtype=np.int64)
            for i in range(n_strs):
                str = uniq_A[i]
                # destination is chosen by hash, so equal strings from
                # different senders get the same node_id
                node_id = hash(str) % n_pes
                # lens
                ind = send_disp[node_id] + tmp_offset[node_id]
                send_arr_lens[ind] = len(str)
                tmp_offset[node_id] += 1
                # chars
                indc = send_disp_chars[node_id] + tmp_offset_chars[node_id]
                str_copy(send_arr_chars, indc, str.c_str(), len(str))
                tmp_offset_chars[node_id] += len(str)
                hpat.str_ext.del_str(str)
            # shuffle len values
            # lengths are received directly into recv_arr's offset buffer
            offset_ptr = hpat.str_arr_ext.get_offset_ptr(recv_arr)
            c_alltoallv(send_arr_lens.ctypes, offset_ptr, send_counts.ctypes,
                        recv_counts.ctypes, send_disp.ctypes, recv_disp.ctypes,
                        int32_typ_enum)
            data_ptr = hpat.str_arr_ext.get_data_ptr(recv_arr)
            # shuffle char values
            c_alltoallv(send_arr_chars.ctypes, data_ptr,
                        send_chars_count.ctypes, recv_chars_count.ctypes,
                        send_disp_chars.ctypes, recv_disp_chars.ctypes,
                        char_typ_enum)
            # NOTE(review): presumably turns the received lengths into
            # offsets in place -- confirm against convert_len_arr_to_offset
            convert_len_arr_to_offset(offset_ptr, recv_size)
            # de-duplicate what was received, then sum the local counts
            loc_nuniq = len(set(recv_arr))
            return hpat.distributed_api.dist_reduce(loc_nuniq, np.int32(sum_op))

        return nunique_par_str

    assert arr_typ == types.Array(types.int64, 1, 'C'), "only in64 for parallel nunique"

    def nunique_par(A):
        # local de-duplication first, so only unique values are exchanged
        uniq_A = hpat.utils.to_array(set(A))
        send_counts, recv_counts = hpat.hiframes_join.send_recv_counts_new(
            uniq_A)
        send_disp = hpat.hiframes_join.calc_disp(send_counts)
        recv_disp = hpat.hiframes_join.calc_disp(recv_counts)
        recv_size = recv_counts.sum()
        # (send_counts, recv_counts, send_disp, recv_disp,
        #  recv_size) = hpat.hiframes_join.get_sendrecv_counts(uniq_A)
        send_arr = np.empty_like(uniq_A)
        recv_arr = np.empty(recv_size, uniq_A.dtype)
        hpat.hiframes_join.shuffle_data(send_counts, recv_counts, send_disp,
                                        recv_disp, uniq_A, send_arr, recv_arr)
        # de-duplicate what was received, then sum the local counts
        loc_nuniq = len(set(recv_arr))
        return hpat.distributed_api.dist_reduce(loc_nuniq, np.int32(sum_op))

    return nunique_par