def _sdc_take_list_str_impl(data, indexes):
    """Take strings from ``data`` at the positions listed in ``indexes``.

    ``indexes`` is a sequence of index sequences. The result holds
    ``data[indexes[i][j]]`` for every ``i`` and ``j``, laid out chunk after
    chunk in the order the chunks appear in ``indexes``. Elements that are NA
    in ``data`` are marked NA in the result.

    Returns the array created by ``pre_alloc_string_array``.
    """
    n_chunks = len(indexes)

    # starts[i] is the output position of the first element of indexes[i];
    # starts[n_chunks] is the total number of output elements. Building this
    # table once replaces the per-chunk re-summation of all previous chunk
    # lengths (quadratic in the number of chunks).
    starts = numpy.zeros(n_chunks + 1, dtype=numpy.int64)
    for i in range(n_chunks):
        starts[i + 1] = starts[i] + len(indexes[i])
    res_size = starts[n_chunks]

    # First pass: count the UTF-8 bytes needed and record NA positions.
    # Every chunk writes to its own disjoint range of nan_mask and the byte
    # counter is a plain sum, so this pass can stay a prange.
    nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
    num_total_bytes = 0
    for i in numba.prange(n_chunks):
        current_pos = starts[i]
        for j in range(len(indexes[i])):
            num_total_bytes += get_utf8_size(data[indexes[i][j]])
            if isna(data, indexes[i][j]):
                nan_mask[current_pos] = True
            current_pos += 1

    res_arr = pre_alloc_string_array(res_size, num_total_bytes)

    # Second pass: fill the result. This is done sequentially, like the other
    # string-array fill loops in this file, because setitem on the string
    # array is not safe to run from a parallel loop.
    out_pos = 0
    for i in range(n_chunks):
        for j in range(len(indexes[i])):
            res_arr[out_pos] = data[indexes[i][j]]
            if nan_mask[out_pos]:
                str_arr_set_na(res_arr, out_pos)
            out_pos += 1

    return res_arr
def getitem_str_impl(arr, slice_index, start, count):
    """Return the first ``slice_index.stop`` strings of a distributed array.

    ``arr`` is the local chunk, ``start`` the global position of its first
    element and ``count`` its length. Every rank contributes the part of its
    chunk that lies before ``slice_index.stop``; the parts are gathered, the
    character totals are reduced, and the result is passed to ``bcast``.
    """
    my_rank = sdc.distributed_api.get_rank()
    stop = slice_index.stop

    # get total characters for allocation
    char_total = np.uint64(0)

    if stop <= start:
        # the slice ends before this chunk begins: nothing to contribute
        piece = arr[:0]
    else:
        # the slice reaches into this chunk: send the covered prefix
        piece = arr[:min(count, stop - start)]

    # local character count first, then gather the pieces and reduce the
    # counts over all ranks
    char_total = num_total_chars(piece)
    piece = sdc.distributed_api.gatherv(piece)
    char_total = sdc.distributed_api.dist_reduce(char_total, np.int32(reduce_op))

    if my_rank == 0:
        result = piece
    else:
        # other ranks only need a buffer of the right size for the broadcast
        result = pre_alloc_string_array(stop, char_total)

    # actual communication
    sdc.distributed_api.bcast(result)
    return result
def prealloc_impl(arr):
    """Return ``arr`` on the root rank and a pre-allocated array elsewhere.

    The element count and the total character count of ``arr`` are each passed
    through ``bcast_scalar`` on every rank. Ranks other than ``MPI_ROOT`` then
    return a fresh string array allocated with the two values they got back.
    """
    my_rank = sdc.distributed_api.get_rank()

    # Both calls run unconditionally and in this order on every rank.
    item_count = bcast_scalar(len(arr))
    char_count = bcast_scalar(np.int64(num_total_chars(arr)))

    if my_rank == MPI_ROOT:
        return arr
    return pre_alloc_string_array(item_count, char_count)
def set_string_to_array(A):
    """Copy the strings of set ``A`` into a newly allocated string array."""
    # TODO: support unicode
    n_items = len(A)
    n_chars = num_total_chars_set_string(A)

    result = pre_alloc_string_array(n_items, n_chars)
    populate_str_arr_from_set(A, result)
    return result
def _str_replace_noregex_impl(str_arr, pat, val):
    """Apply ``str.replace(pat, val)`` to every element of ``str_arr``.

    The replaced strings are first stored in a temporary string list while
    their UTF-8 sizes are summed; the output array is then allocated with that
    total and filled from the list. The result is wrapped with
    ``sdc.hiframes.api.init_series``.
    """
    numba.parfor.init_prange()
    count = len(str_arr)
    replaced = sdc.str_ext.alloc_str_list(count)
    total_bytes = 0
    for idx in numba.parfor.internal_prange(count):
        new_str = str_arr[idx].replace(pat, val)
        total_bytes += get_utf8_size(new_str)
        replaced[idx] = new_str

    numba.parfor.init_prange()
    result = pre_alloc_string_array(count, total_bytes)
    for idx in numba.parfor.internal_prange(count):
        result[idx] = replaced[idx]

    return sdc.hiframes.api.init_series(result)
def sdc_astype_number_to_string_impl(self, dtype):
    """Convert every element of ``self`` to its ``str()`` form.

    ``dtype`` is not used by this implementation. Returns a string array
    allocated with ``pre_alloc_string_array`` that holds ``str(self[i])`` at
    position ``i``.
    """
    num_bytes = 0
    arr_len = len(self)

    # Get total bytes for new array.
    # This loop is deliberately sequential: the other variant of this function
    # in this file records that prange segfaults here (FIXME_Numba#6969).
    # Switch back to prange once that issue is resolved.
    for i in range(arr_len):
        item = self[i]
        num_bytes += get_utf8_size(str(item))

    data = pre_alloc_string_array(arr_len, num_bytes)

    for i in range(arr_len):
        item = self[i]
        data[i] = str(item)  # TODO: check NA

    return data
def _sdc_take_str_arr_impl(data, indexes):
    """Take strings from ``data`` at the positions listed in ``indexes``.

    Output element ``i`` is ``data[indexes[i]]``; elements that are NA in
    ``data`` are marked NA in the result.
    """
    n_out = len(indexes)
    is_missing = numpy.zeros(n_out, dtype=numpy.bool_)

    # Pass 1: sum the UTF-8 sizes and remember which outputs are NA.
    total_bytes = 0
    for pos in numba.prange(n_out):
        src = indexes[pos]
        total_bytes += get_utf8_size(data[src])
        if isna(data, src):
            is_missing[pos] = True

    taken = pre_alloc_string_array(n_out, total_bytes)

    # Pass 2: copy the strings one by one (no prange for this loop).
    for pos in numpy.arange(n_out):
        src = indexes[pos]
        taken[pos] = data[src]
        if is_missing[pos]:
            str_arr_set_na(taken, pos)

    return taken
def sdc_astype_number_to_string_impl(self, dtype):
    """Convert every element of ``self`` to its ``str()`` form.

    ``dtype`` is not used by this implementation. Returns a string array
    allocated with ``pre_alloc_string_array`` that holds ``str(self[i])`` at
    position ``i``.
    """
    n_items = len(self)

    # Get total bytes for new array
    total_bytes = 0
    # FIXME_Numba#6969: prange segfaults, use it when resolved
    for pos in np.arange(n_items):
        total_bytes += get_utf8_size(str(self[pos]))

    result = pre_alloc_string_array(n_items, total_bytes)

    for pos in range(n_items):
        result[pos] = str(self[pos])  # TODO: check NA

    return result
def gatherv_str_arr_impl(data):
    """Gather the local string arrays of all ranks into one string array.

    Two ``c_gatherv`` calls are issued on every rank: the first sends the
    per-element lengths, the second sends the raw character buffer. The
    receive buffer is allocated only on ``MPI_ROOT``; every other rank keeps
    the one-element dummy ``StringArray(['']))`` and empty displacement
    arrays, and returns that dummy.

    Returns ``all_data`` (the gathered array on the root rank).
    """
    rank = sdc.distributed_api.get_rank()
    n_loc = len(data)
    n_all_chars = num_total_chars(data)

    # allocate send lens arrays
    send_arr_lens = np.empty(n_loc, np.uint32)  # XXX offset type is uint32
    send_data_ptr = get_data_ptr(data)

    # NOTE(review): lengths are taken with len(), i.e. per-string character
    # counts, while the data buffer is sent with n_all_chars elements of
    # char_typ_enum. Presumably these agree only for single-byte characters;
    # confirm for non-ASCII data.
    for i in range(n_loc):
        _str = data[i]
        send_arr_lens[i] = len(_str)

    # NOTE(review): counts are narrowed to int32 before gathering; verify that
    # per-rank character totals cannot exceed the int32 range.
    recv_counts = gather_scalar(np.int32(n_loc))
    recv_counts_char = gather_scalar(np.int32(n_all_chars))
    n_total = recv_counts.sum()
    n_total_char = recv_counts_char.sum()

    # displacements
    all_data = StringArray([''])  # dummy arrays on non-root PEs
    displs = np.empty(0, np.int32)
    displs_char = np.empty(0, np.int32)

    if rank == MPI_ROOT:
        all_data = pre_alloc_string_array(n_total, n_total_char)
        displs = sdc.hiframes.join.calc_disp(recv_counts)
        displs_char = sdc.hiframes.join.calc_disp(recv_counts_char)

    # The pointers are taken from whichever all_data this rank holds (real
    # buffer on root, dummy elsewhere); both gathers run unconditionally.
    offset_ptr = get_offset_ptr(all_data)
    data_ptr = get_data_ptr(all_data)
    # First gather: element lengths are received into the offset buffer.
    c_gatherv(
        send_arr_lens.ctypes,
        np.int32(n_loc),
        offset_ptr,
        recv_counts.ctypes,
        displs.ctypes,
        int32_typ_enum)
    # Second gather: the character data itself.
    c_gatherv(
        send_data_ptr,
        np.int32(n_all_chars),
        data_ptr,
        recv_counts_char.ctypes,
        displs_char.ctypes,
        char_typ_enum)
    # The offset buffer currently holds lengths; convert it in place.
    convert_len_arr_to_offset(offset_ptr, n_total)
    return all_data
def ensure_capacity_str(arr, new_size, n_chars):
    """Return a string array that can hold ``new_size`` items plus ``n_chars``.

    ``new_size`` is the position right after the write index, and ``n_chars``
    is the size of the string about to be written at ``new_size - 1``.

    If ``arr`` already has room for both, it is returned unchanged. Otherwise
    a larger array is allocated, the first ``new_size - 1`` items of ``arr``
    are copied into it, and the new array is returned.
    """
    new_arr = arr
    curr_len = len(arr)
    curr_num_chars = num_total_chars(arr)
    needed_total_chars = getitem_str_offset(arr, new_size - 1) + n_chars
    # TODO: corner case test

    if curr_len < new_size or needed_total_chars > curr_num_chars:
        # Double the item capacity, but never end up below new_size: plain
        # doubling yields 0 for an empty array and can still be too small
        # when new_size is more than twice the current length.
        new_len = int(max(2 * curr_len, new_size) if curr_len < new_size else curr_len)
        # needed_total_chars is at most curr_num_chars + n_chars, so doubling
        # plus n_chars always covers it.
        new_num_chars = int(
            2 * curr_num_chars + n_chars
            if needed_total_chars > curr_num_chars else curr_num_chars)
        new_arr = pre_alloc_string_array(new_len, new_num_chars)
        copy_str_arr_slice(new_arr, arr, new_size - 1)

    return new_arr
def sdc_fillna_str_impl(self, inplace=False, value=None):
    """Return a copy of string array ``self`` with NA elements set to ``value``.

    ``inplace`` is not used by this implementation; a new array is always
    allocated and returned.
    """
    n = len(self)

    # Get the total size of the new array. The buffer is sized in UTF-8 bytes
    # rather than len() (code points), which would under-allocate for
    # non-ASCII strings.
    num_bytes = 0
    for i in prange(n):
        s = self[i]
        if sdc.hiframes.api.isna(self, i):
            num_bytes += get_utf8_size(value)
        else:
            num_bytes += get_utf8_size(s)

    filled_data = pre_alloc_string_array(n, num_bytes)

    # StringArray doesn't support parallel setitem, thus no prange here
    for i in range(n):
        if sdc.hiframes.api.isna(self, i):
            filled_data[i] = value
        else:
            filled_data[i] = self[i]

    return filled_data
def sdc_fillna_str_impl(self, inplace=False, value=None):
    """Return a copy of string array ``self`` with NA elements set to ``value``.

    ``inplace`` is not used by this implementation; a new array is always
    allocated and returned.
    """
    size = len(self)

    # get total chars in new array
    total_bytes = 0
    for pos in prange(size):
        current = self[pos]
        if not sdc.hiframes.api.isna(self, pos):
            total_bytes += get_utf8_size(current)
        else:
            total_bytes += get_utf8_size(value)

    result = pre_alloc_string_array(size, total_bytes)

    # StringArray doesn't support parallel setitem, thus no prange here
    for pos in numpy.arange(size):
        if not sdc.hiframes.api.isna(self, pos):
            result[pos] = self[pos]
        else:
            result[pos] = value

    return result
def empty_like_type_str_arr(n, arr):
    """Allocate a string array of ``n`` items sized after ``arr``.

    The character budget is ``n`` times the average per-item character count
    of ``arr`` (integer division). When ``arr`` is empty, a fixed guess of 20
    characters per item is used instead.
    """
    sample_len = len(arr)
    if sample_len == 0:
        chars_per_item = 20  # heuristic
    else:
        chars_per_item = num_total_chars(arr) // sample_len
    return pre_alloc_string_array(n, n * chars_per_item)
def trim_arr_str(arr, size):
    """Return a new string array holding the first ``size`` items of ``arr``.

    The character capacity of the new array is the offset stored for index
    ``size`` in ``arr``.
    """
    n_chars = np.int64(getitem_str_offset(arr, size))
    trimmed = pre_alloc_string_array(size, n_chars)
    copy_str_arr_slice(trimmed, arr, size)
    return trimmed
def sdc_join_series_indexes_impl(left, right):
    """Outer-join two string indexes and return the joined index with indexers.

    Both inputs are sorted (stable mergesort) and merged. Equal values produce
    the cross product of their occurrences on both sides.

    Returns a tuple ``(joined, lidx, ridx)``:

    * ``joined`` - string array with the joined index values;
    * ``lidx`` / ``ridx`` - int64 arrays with, for each joined position, the
      source position in ``left`` / ``right`` or ``-1`` when that side has no
      match. Both come from ``_hpat_ensure_array_capacity`` and are returned
      as is (they are not trimmed to the joined length here).
    """
    # allocate result arrays
    lsize = len(left)
    rsize = len(right)
    est_total_size = int(1.1 * (lsize + rsize))

    lidx = numpy.empty(est_total_size, numpy.int64)
    ridx = numpy.empty(est_total_size, numpy.int64)

    # use Series.sort_values since argsort for StringArrays not implemented
    original_left_series = pandas.Series(left)
    original_right_series = pandas.Series(right)

    # sort arrays saving the old positions
    left_series = original_left_series.sort_values(kind='mergesort')
    right_series = original_right_series.sort_values(kind='mergesort')
    sorted_left = left_series._index
    sorted_right = right_series._index

    # i / j walk the sorted left / right positions, k is the write cursor
    i, j, k = 0, 0, 0
    while (i < lsize and j < rsize):
        lidx = _hpat_ensure_array_capacity(k + 1, lidx)
        ridx = _hpat_ensure_array_capacity(k + 1, ridx)

        left_index = left[sorted_left[i]]
        right_index = right[sorted_right[j]]

        if (left_index < right_index):
            lidx[k] = sorted_left[i]
            ridx[k] = -1
            i += 1
            k += 1
        elif (left_index > right_index):
            lidx[k] = -1
            ridx[k] = sorted_right[j]
            j += 1
            k += 1
        else:
            # find ends of sequences of equal index values in left and right
            ni, nj = i, j
            while (ni < lsize and left[sorted_left[ni]] == left_index):
                ni += 1
            while (nj < rsize and right[sorted_right[nj]] == right_index):
                nj += 1

            # join the blocks found into results
            for s in numpy.arange(i, ni, 1):
                block_size = nj - j
                to_lidx = numpy.repeat(sorted_left[s], block_size)
                # the comprehension variable is named apart from the write
                # cursor k on purpose
                to_ridx = numpy.array(
                    [sorted_right[pos] for pos in numpy.arange(j, nj, 1)],
                    numpy.int64)

                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)

                lidx[k:k + block_size] = to_lidx
                ridx[k:k + block_size] = to_ridx
                k += block_size
            i = ni
            j = nj

    # fill the end of joined with remaining part of left or right
    if i < lsize:
        block_size = lsize - i
        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
        ridx[k:k + block_size] = numpy.repeat(-1, block_size)
        while i < lsize:
            lidx[k] = sorted_left[i]
            i += 1
            k += 1

    elif j < rsize:
        block_size = rsize - j
        lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
        ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
        lidx[k:k + block_size] = numpy.repeat(-1, block_size)
        while j < rsize:
            ridx[k] = sorted_right[j]
            j += 1
            k += 1

    # Count the storage needed for the joined array and allocate it. Sizes are
    # taken in UTF-8 bytes rather than len() (code points), which would
    # under-allocate the buffer for non-ASCII index values.
    total_joined_size = k
    num_chars_in_joined = 0
    for i in numpy.arange(total_joined_size):
        if lidx[i] != -1:
            num_chars_in_joined += get_utf8_size(left[lidx[i]])
        elif ridx[i] != -1:
            num_chars_in_joined += get_utf8_size(right[ridx[i]])

    joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined)

    # iterate over joined and fill it with indexes using lidx and ridx indexers
    for i in numpy.arange(total_joined_size):
        if lidx[i] != -1:
            joined[i] = left[lidx[i]]
            if (str_arr_is_na(left, lidx[i])):
                str_arr_set_na(joined, i)
        elif ridx[i] != -1:
            joined[i] = right[ridx[i]]
            if (str_arr_is_na(right, ridx[i])):
                str_arr_set_na(joined, i)
        else:
            str_arr_set_na(joined, i)

    return joined, lidx, ridx