def dropna_impl(arr, idx, name):
    """Build a Series from the entries of ``arr`` that are not NaN.

    The work is split over the ranges returned by ``parallel_chunks`` and is
    done in two parallel passes: the first counts the surviving elements of
    every chunk, the second copies them (and the matching ``idx`` entries)
    into their final positions.

    Parameters
    ----------
    arr : indexable sequence of values tested with ``isnan``
    idx : indexable sequence of labels, read at the same positions as ``arr``
    name : passed through as the third argument of ``pandas.Series``

    Returns
    -------
    pandas.Series built from the kept values, the kept labels and ``name``.
    """
    chunks = parallel_chunks(len(arr))
    arr_len = numpy.empty(len(chunks), dtype=numpy.int64)
    length = 0

    # Pass 1: count kept elements per chunk; ``length`` accumulates the total.
    for i in prange(len(chunks)):
        chunk = chunks[i]
        res = 0
        for j in range(chunk.start, chunk.stop):
            if not isnan(arr[j]):
                res += 1
        length += res
        arr_len[i] = res

    result_data = numpy.empty(shape=length, dtype=dtype)
    result_index = numpy.empty(shape=length, dtype=dtype_idx)

    # Pass 2: each chunk writes into its own slot of the outputs. The slot
    # starts at the number of elements kept by all preceding chunks.
    for i in prange(len(chunks)):
        chunk = chunks[i]
        current_pos = int(sum(arr_len[0:i]))

        for j in range(chunk.start, chunk.stop):
            if not isnan(arr[j]):
                result_data[current_pos] = arr[j]
                result_index[current_pos] = idx[j]
                current_pos += 1

    return pandas.Series(result_data, result_index, name)
def nancumsum_impl(arr, like_pandas=False):
    """Cumulative sum of ``arr`` in which NaN entries do not advance the sum.

    Each chunk returned by ``parallel_chunks`` is scanned independently, then
    a second pass adds to every element the total of all preceding chunks.

    Parameters
    ----------
    arr : array whose elements are tested with ``is_nan``
    like_pandas : when truthy, the output at a NaN position is
        ``running_sum + arr[j]`` (the NaN is written through) while the
        running sum itself is left unchanged; otherwise the output at a NaN
        position is the unchanged running sum.

    Returns
    -------
    Array created with ``numpy.empty_like(arr)`` holding the cumulative sums.
    """
    chunks = parallel_chunks(len(arr))
    partial_sum = numpy.zeros(len(chunks), dtype=retty)
    result = numpy.empty_like(arr)

    # below line is only needed since Literal[bool] var cannot be converted to bool
    # in a prange due to a bug related to absence of BooleanLiterals in Numba
    _like_pandas = True if like_pandas else False

    # Pass 1: chunk-local cumulative sums; remember each chunk's total.
    for i in prange(len(chunks)):
        chunk = chunks[i]
        partial = zero
        for j in range(chunk.start, chunk.stop):
            if _like_pandas:
                result[j] = partial + arr[j]
                # ``not`` (logical) rather than ``~`` (bitwise): ``~`` on a
                # plain Python bool is always truthy.
                if not is_nan(arr[j]):
                    partial = result[j]
            else:
                if not is_nan(arr[j]):
                    partial += arr[j]
                result[j] = partial
        partial_sum[i] = partial

    # Pass 2: shift every chunk by the sum of the totals of earlier chunks.
    for i in prange(len(chunks)):
        prefix = sum(partial_sum[0:i])
        chunk = chunks[i]
        for j in range(chunk.start, chunk.stop):
            result[j] += prefix

    return result
def sdc_nanargmin_impl(self):
    """Return the position of the extreme non-NaN element of ``self``.

    ``reduce_op`` and ``initial_result`` come from the enclosing scope and
    decide which element wins (presumably a minimum, given the function
    name -- confirm against the caller). Every chunk produces its own best
    value and position in parallel; the per-chunk winners are then combined
    serially. Ties resolve to the smallest position. If no element is ever
    selected the returned position is ``max_int64``.
    """
    blocks = parallel_chunks(len(self))
    n_blocks = len(blocks)
    block_best = numpy.empty(shape=n_blocks, dtype=dtype)
    block_best_pos = numpy.empty(shape=n_blocks, dtype=numpy.int64)

    # Parallel stage: best value / position inside each block.
    for b in prange(n_blocks):
        block = blocks[b]
        best = initial_result
        best_pos = max_int64
        for k in range(block.start, block.stop):
            # Skip elements that do not win the reduction, and NaNs.
            if reduce_op(best, self[k]) != self[k] or isnan(self[k]):
                continue
            if best != self[k]:
                best_pos = k
                best = self[k]
            else:
                best_pos = min(best_pos, k)
        block_best[b] = best
        block_best_pos[b] = best_pos

    # Serial stage: fold the per-block winners together.
    overall = initial_result
    overall_pos = max_int64
    for b in range(n_blocks):
        if reduce_op(overall, block_best[b]) != block_best[b]:
            continue
        if overall != block_best[b]:
            overall_pos = block_best_pos[b]
            overall = block_best[b]
        else:
            overall_pos = min(overall_pos, block_best_pos[b])

    return overall_pos
def nancumsum_impl(arr, like_pandas=False):
    """Cumulative sum of ``arr`` in which NaN entries do not advance the sum.

    Each chunk returned by ``parallel_chunks`` is scanned independently, then
    a second pass adds to every element the total of all preceding chunks.

    Parameters
    ----------
    arr : array whose elements are tested with ``is_nan``
    like_pandas : when truthy, the output at a NaN position is
        ``running_sum + arr[j]`` (the NaN is written through) while the
        running sum itself is left unchanged; otherwise the output at a NaN
        position is the unchanged running sum.

    Returns
    -------
    Array created with ``numpy.empty_like(arr)`` holding the cumulative sums.
    """
    chunks = parallel_chunks(len(arr))
    partial_sum = numpy.zeros(len(chunks), dtype=retty)
    result = numpy.empty_like(arr)

    # Pass 1: chunk-local cumulative sums; remember each chunk's total.
    for i in prange(len(chunks)):
        chunk = chunks[i]
        partial = zero
        for j in range(chunk.start, chunk.stop):
            if like_pandas:
                result[j] = partial + arr[j]
                # ``not`` (logical) rather than ``~`` (bitwise): ``~`` on a
                # plain Python bool is always truthy.
                if not is_nan(arr[j]):
                    partial = result[j]
            else:
                if not is_nan(arr[j]):
                    partial += arr[j]
                result[j] = partial
        partial_sum[i] = partial

    # Pass 2: shift every chunk by the sum of the totals of earlier chunks.
    for i in prange(len(chunks)):
        prefix = sum(partial_sum[0:i])
        chunk = chunks[i]
        for j in range(chunk.start, chunk.stop):
            result[j] += prefix

    return result
def find_idx_impl(arr, idx):
    """Collect, per chunk, the positions at which ``arr`` equals ``idx``.

    Returns a Python list with one ``List`` of int64 per chunk produced by
    ``parallel_chunks``; the k-th inner list holds the matching positions
    found inside the k-th chunk, in increasing order.
    """
    blocks = parallel_chunks(len(arr))
    n_blocks = len(blocks)
    matches = [List.empty_list(types.int64) for k in range(n_blocks)]

    # Every block appends only to its own list.
    for b in prange(n_blocks):
        block = blocks[b]
        for pos in range(block.start, block.stop):
            if arr[pos] == idx:
                matches[b].append(pos)

    return matches
def apply_converter_to_column_impl(table, col_idx, func):
    """Read one column of ``table`` cell by cell and convert it with ``func``.

    The column is selected by ``col_idx_literal_val`` from the enclosing
    scope (the ``col_idx`` argument itself is not read here). Rows are
    processed in parallel over the ranges given by ``parallel_chunks``; each
    cell is fetched into a per-chunk storage object, decoded with
    ``read_from_storage`` and passed to ``func``.

    Returns the column allocated by ``df_alloc_column_of_dtype`` with one
    converted value per table row.
    """
    n_rows = len(table)
    row_blocks = parallel_chunks(n_rows)
    converted = df_alloc_column_of_dtype(res_dtype, n_rows)

    for b in numba.prange(len(row_blocks)):
        block = row_blocks[b]
        # One storage object per block, reused for every cell of the block.
        storage = alloc_res_storage(col_dtype)
        for row in range(block.start, block.stop):
            arrow_reader_get_table_cell(table, col_idx_literal_val, row, storage)
            converted[row] = func(read_from_storage(col_dtype, storage))
        # no release_res_storage as meminfo managed objects are freed automatically

    return converted
def impl(self):
    """Evaluate a rolling-window statistic over the wrapped series.

    The statistic is defined by the enclosing scope through ``init_result``,
    ``put`` (add a value to the running state), ``pop`` (remove a value) and
    ``result_or_nan`` (turn the state into an output value). Output chunks
    are processed in parallel; each chunk first rebuilds the window state
    from the elements just before it, so chunks are independent.

    Returns a ``pandas.Series`` of float64 with the index and name of the
    input series.
    """
    window = self._window
    min_periods = self._min_periods
    series = self._data
    values = series._data
    n_values = len(values)
    out = numpy.empty(n_values, dtype=float64)

    blocks = parallel_chunks(n_values)
    for b in prange(len(blocks)):
        block = blocks[b]
        count = 0
        acc = init_result

        # Zero-width window: every output comes from the untouched state.
        if window == 0:
            for pos in range(block.start, block.stop):
                out[pos] = result_or_nan(count, min_periods, acc)
            continue

        warmup_start = max(0, block.start - window + 1)
        fill_stop = min(warmup_start + window, block.stop)

        # Warm-up: absorb the elements preceding the block, no output yet.
        for pos in range(warmup_start, block.start):
            count, acc = put(values[pos], count, acc)

        # Fill: the window is still growing, so only add values.
        for pos in range(block.start, fill_stop):
            count, acc = put(values[pos], count, acc)
            out[pos] = result_or_nan(count, min_periods, acc)

        # Slide: add the entering value and drop the one leaving the window.
        for pos in range(fill_stop, block.stop):
            incoming = values[pos]
            outgoing = values[pos - window]
            count, acc = put(incoming, count, acc)
            count, acc = pop(outgoing, count, acc, values, pos, window)
            out[pos] = result_or_nan(count, min_periods, acc)

    return pandas.Series(out, series._index, name=series._name)
def cumsum_impl(arr):
    """Cumulative sum of ``arr`` computed chunk-wise in two parallel passes.

    The first pass writes chunk-local cumulative sums and records each
    chunk's total; the second pass adds to every element the combined total
    of all earlier chunks. The result is an array created with
    ``numpy.empty_like(arr)``.
    """
    blocks = parallel_chunks(len(arr))
    n_blocks = len(blocks)
    block_totals = numpy.zeros(n_blocks, dtype=retty)
    out = numpy.empty_like(arr)

    for b in prange(n_blocks):
        block = blocks[b]
        running = zero
        for k in range(block.start, block.stop):
            out[k] = running + arr[k]
            # Read the value back from the output so the running sum goes
            # through the same storage as the stored result.
            running = out[k]
        block_totals[b] = running

    for b in prange(n_blocks):
        block = blocks[b]
        offset = sum(block_totals[:b])
        for k in range(block.start, block.stop):
            out[k] += offset

    return out
def getitem_by_mask_impl(arr, idx):
    """Select the elements of ``arr`` at positions where ``idx`` is truthy.

    ``is_str_arr`` and ``is_range`` are flags from the enclosing scope:
    with ``is_range`` the value at position ``j`` is computed as
    ``arr.start + arr.step * j`` instead of being read from ``arr``; with
    ``is_str_arr`` values are gathered in a list together with a NaN mask
    and converted by ``create_str_arr_from_list`` /
    ``str_arr_set_na_by_mask`` before returning.

    The selection runs in two parallel passes over ``parallel_chunks``:
    count the selected elements per chunk, then copy them to their final
    positions.
    """
    blocks = parallel_chunks(len(arr))
    n_blocks = len(blocks)
    kept_per_block = numpy.empty(n_blocks, dtype=numpy.int64)
    total_kept = 0

    # Pass 1: number of selected elements in every block and overall.
    for b in prange(n_blocks):
        block = blocks[b]
        kept = 0
        for k in range(block.start, block.stop):
            if idx[k]:
                kept += 1
        total_kept += kept
        kept_per_block[b] = kept

    if is_str_arr == True:  # noqa
        selected = [''] * total_kept
        selected_nan_mask = numpy.empty(shape=total_kept, dtype=types.bool_)
    else:
        selected = numpy.empty(shape=total_kept, dtype=res_dtype)

    # Pass 2: each block writes starting at the count kept by earlier blocks.
    for b in prange(n_blocks):
        block = blocks[b]
        write_pos = int(sum(kept_per_block[:b]))

        for k in range(block.start, block.stop):
            if not idx[k]:
                continue
            if is_range == True:  # noqa
                item = arr.start + arr.step * k
            else:
                item = arr[k]
            selected[write_pos] = item
            if is_str_arr == True:  # noqa
                selected_nan_mask[write_pos] = isna(arr, k)
            write_pos += 1

    if is_str_arr == True:  # noqa
        selected_as_str_arr = create_str_arr_from_list(selected)
        str_arr_set_na_by_mask(selected_as_str_arr, selected_nan_mask)
        return selected_as_str_arr
    else:
        return selected
def _impl(self, other=None, pairwise=None, ddof=1):
    """Evaluate a two-series rolling-window statistic.

    Judging by the helper names (``put_cov``, ``pop_cov``,
    ``cov_result_or_nan``) this is presumably a rolling covariance -- confirm
    against the enclosing overload.

    Parameters
    ----------
    other : object whose ``_data`` is paired with this object's data; it is
        not read when the enclosing-scope flag ``nan_other`` is true, in which
        case the main data is paired with itself.
    pairwise : not read in this body.
    ddof : forwarded unchanged to ``cov_result_or_nan``.

    Returns
    -------
    ``pandas.Series`` of float64 whose length is that of the longer input.
    """
    win = self._window
    minp = self._min_periods
    main_series = self._data
    main_arr = main_series._data

    # ``nan_other`` comes from the enclosing scope.
    if nan_other == True:  # noqa
        other_arr = main_arr
    else:
        other_arr = other._data

    main_arr_length = len(main_arr)
    other_arr_length = len(other_arr)
    # Pairs (x, y) exist only below ``min_length``; outputs are produced up to
    # ``length``.
    min_length = min(main_arr_length, other_arr_length)
    length = max(main_arr_length, other_arr_length)
    output_arr = numpy.empty(length, dtype=float64)

    chunks = parallel_chunks(length)
    for i in prange(len(chunks)):
        chunk = chunks[i]
        # Running window state, rebuilt from scratch for every chunk.
        nfinite = 0
        result = (0., 0., 0., 0.)

        # Zero-width window: every output comes from the untouched state.
        if win == 0:
            for idx in range(chunk.start, chunk.stop):
                output_arr[idx] = cov_result_or_nan(nfinite, minp, result, ddof)
            continue

        # Prelude: elements before the chunk that belong to its first window.
        prelude_start = max(0, chunk.start - win + 1)
        prelude_stop = min(chunk.start, min_length)
        # Interlude: positions of the chunk handled with puts only.
        interlude_start = chunk.start
        interlude_stop = min(prelude_start + win, chunk.stop, min_length)
        # Postlude: positions handled with one put and one pop each.
        postlude_start = min(prelude_start + win, chunk.stop)
        postlude_stop = min(chunk.stop, min_length)

        # Absorb the prelude without producing output.
        for idx in range(prelude_start, prelude_stop):
            x, y = main_arr[idx], other_arr[idx]
            nfinite, result = put_cov(x, y, nfinite, result, align_finiteness=align_finiteness)

        # Add pairs and emit a result for each position.
        for idx in range(interlude_start, interlude_stop):
            x, y = main_arr[idx], other_arr[idx]
            nfinite, result = put_cov(x, y, nfinite, result, align_finiteness=align_finiteness)
            output_arr[idx] = cov_result_or_nan(nfinite, minp, result, ddof)

        # Slide: add the pair at idx and remove the pair at idx - win.
        for idx in range(postlude_start, postlude_stop):
            put_x, put_y = main_arr[idx], other_arr[idx]
            pop_x, pop_y = main_arr[idx - win], other_arr[idx - win]
            nfinite, result = put_cov(put_x, put_y, nfinite, result, align_finiteness=align_finiteness)
            nfinite, result = pop_cov(pop_x, pop_y, nfinite, result, align_finiteness=align_finiteness)
            output_arr[idx] = cov_result_or_nan(nfinite, minp, result, ddof)

        # Positions at or past ``min_length`` and before ``postlude_start``:
        # nothing is added or removed, the current state is emitted as is.
        last_start = max(min_length, interlude_start)
        for idx in range(last_start, postlude_start):
            output_arr[idx] = cov_result_or_nan(nfinite, minp, result, ddof)

        # Positions at or past ``min_length`` and below ``min_length + win``:
        # only the pair at idx - win is removed.
        last_start = max(min_length, postlude_start)
        last_stop = min(min_length + win, chunk.stop)
        for idx in range(last_start, last_stop):
            x, y = main_arr[idx - win], other_arr[idx - win]
            nfinite, result = pop_cov(x, y, nfinite, result, align_finiteness=align_finiteness)
            output_arr[idx] = cov_result_or_nan(nfinite, minp, result, ddof)

        # The rest of the chunk is filled with NaN.
        for idx in range(last_stop, chunk.stop):
            output_arr[idx] = numpy.nan

    return pandas.Series(output_arr)