Exemplo n.º 1
0
        def _sdc_take_list_str_impl(data, indexes):
            """Take strings from `data` at the positions listed in `indexes`.

            `indexes` is a sequence of index sequences; the result is a single
            string array holding `data[indexes[i][j]]` for every `i`, `j`, laid
            out in order (all of `indexes[0]`, then all of `indexes[1]`, ...).
            NA entries of `data` stay NA in the result.
            """
            n_lists = len(indexes)

            # offsets[i] is the output position of the first element taken via
            # indexes[i]; computed once instead of re-summing the lengths of
            # all preceding sub-lists for every i
            offsets = numpy.zeros(n_lists + 1, dtype=numpy.int64)
            for i in range(n_lists):
                offsets[i + 1] = offsets[i] + len(indexes[i])
            res_size = offsets[n_lists]

            # first pass: total UTF-8 size of the result and positions of NAs
            nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
            num_total_bytes = 0
            for i in numba.prange(n_lists):
                current_pos = offsets[i]
                for j in range(len(indexes[i])):
                    num_total_bytes += get_utf8_size(data[indexes[i][j]])
                    if isna(data, indexes[i][j]):
                        nan_mask[current_pos] = True
                    current_pos += 1

            res_arr = pre_alloc_string_array(res_size, num_total_bytes)

            # second pass: StringArray doesn't support parallel setitem,
            # thus no prange here - elements are written strictly in order
            out_pos = 0
            for i in range(n_lists):
                for j in range(len(indexes[i])):
                    res_arr[out_pos] = data[indexes[i][j]]
                    if nan_mask[out_pos]:
                        str_arr_set_na(res_arr, out_pos)
                    out_pos += 1

            return res_arr
Exemplo n.º 2
0
        def getitem_str_impl(arr, slice_index, start, count):
            """Assemble the slice `[:slice_index.stop]` of a distributed string array.

            `arr` is the local chunk, `start` its global offset and `count` its
            length. Every rank contributes the leading part of its chunk that
            lies before `slice_index.stop`; the parts are gathered, and the
            result is then passed through `bcast` and returned.
            """
            rank = sdc.distributed_api.get_rank()
            stop = slice_index.stop

            # get total characters for allocation
            total_chars = np.uint64(0)
            if stop <= start:
                # the slice ends before this chunk begins: nothing to send
                local_part = arr[:0]
            else:
                # the slice reaches into this chunk: send its leading elements
                local_end = min(count, stop - start)
                local_part = arr[:local_end]

            # count the local characters, gather all local parts into one array
            # and reduce the character counts over all ranks
            total_chars = num_total_chars(local_part)
            gathered = sdc.distributed_api.gatherv(local_part)
            total_chars = sdc.distributed_api.dist_reduce(total_chars, np.int32(reduce_op))

            if rank == 0:
                out_arr = gathered
            else:
                # ranks other than 0 only need a buffer of the right size
                out_arr = pre_alloc_string_array(stop, total_chars)

            # actual communication
            sdc.distributed_api.bcast(out_arr)
            return out_arr
Exemplo n.º 3
0
 def prealloc_impl(arr):
     """Return `arr` on the root rank and a freshly pre-allocated string array elsewhere.

     The element count and total character count are passed through
     `bcast_scalar` on every rank, and the returned values size the new array.
     """
     rank = sdc.distributed_api.get_rank()
     # both calls are made on every rank, before any rank-dependent branching
     n_items = bcast_scalar(len(arr))
     n_chars = bcast_scalar(np.int64(num_total_chars(arr)))
     if rank == MPI_ROOT:
         return arr
     return pre_alloc_string_array(n_items, n_chars)
Exemplo n.º 4
0
 def set_string_to_array(A):
     """Build a string array from the set of strings `A`.

     The array is pre-allocated from the set's size and its total character
     count, then filled by `populate_str_arr_from_set`.
     """
     # TODO: support unicode
     n_strs = len(A)
     n_chars = num_total_chars_set_string(A)
     out_arr = pre_alloc_string_array(n_strs, n_chars)
     populate_str_arr_from_set(A, out_arr)
     return out_arr
Exemplo n.º 5
0
def _str_replace_noregex_impl(str_arr, pat, val):
    """Replace `pat` with `val` in every element of `str_arr` (plain `str.replace`).

    Works in two passes: the first computes every replaced string, keeps it in
    a temporary string list and sums the UTF-8 sizes; the second copies the
    strings into a string array pre-allocated with exactly that size. The
    result is wrapped with `sdc.hiframes.api.init_series`.
    """
    # init_prange() is called before each internal_prange loop
    numba.parfor.init_prange()
    n = len(str_arr)
    n_total_chars = 0
    # temporary storage for the replaced strings, one slot per element
    str_list = sdc.str_ext.alloc_str_list(n)
    for i in numba.parfor.internal_prange(n):
        out_str = str_arr[i].replace(pat, val)
        str_list[i] = out_str
        # accumulate the byte size needed by the output array
        n_total_chars += get_utf8_size(out_str)
    numba.parfor.init_prange()
    out_arr = pre_alloc_string_array(n, n_total_chars)
    # second pass: move the already computed strings into the output array
    for i in numba.parfor.internal_prange(n):
        _str = str_list[i]
        out_arr[i] = _str
    return sdc.hiframes.api.init_series(out_arr)
Exemplo n.º 6
0
        def sdc_astype_number_to_string_impl(self, dtype):
            """Convert every element of `self` to its `str()` form.

            Returns a string array of the same length as `self`. The `dtype`
            argument is not used by the body.
            """
            n = len(self)

            # first pass: total UTF-8 size of all string representations
            total_bytes = 0
            for idx in prange(n):
                total_bytes += get_utf8_size(str(self[idx]))

            result = pre_alloc_string_array(n, total_bytes)

            # second pass: write the strings, in order
            for idx in range(n):
                result[idx] = str(self[idx])  # TODO: check NA

            return result
Exemplo n.º 7
0
        def _sdc_take_str_arr_impl(data, indexes):
            """Take strings from `data` at the positions given by `indexes`.

            Returns a string array with `data[indexes[i]]` at position `i`;
            entries that are NA in `data` are marked NA in the result.
            """
            n = len(indexes)
            total_bytes = 0
            is_na = numpy.zeros(n, dtype=numpy.bool_)

            # first pass: total UTF-8 size of the result and the NA positions
            for pos in numba.prange(n):
                src = indexes[pos]
                total_bytes += get_utf8_size(data[src])
                if isna(data, src):
                    is_na[pos] = True

            taken = pre_alloc_string_array(n, total_bytes)

            # second pass: copy the strings one by one, in order
            for pos in numpy.arange(n):
                taken[pos] = data[indexes[pos]]
                if is_na[pos]:
                    str_arr_set_na(taken, pos)

            return taken
Exemplo n.º 8
0
        def sdc_astype_number_to_string_impl(self, dtype):
            """Convert every element of `self` to its `str()` form.

            Returns a string array of the same length as `self`. The `dtype`
            argument is not used by the body.
            """
            size = len(self)

            # Get total bytes for new array
            total_bytes = 0
            for pos in np.arange(size):    # FIXME_Numba#6969: prange segfaults, use it when resolved
                total_bytes += get_utf8_size(str(self[pos]))

            out = pre_alloc_string_array(size, total_bytes)

            # write the string form of each element, in order
            for pos in range(size):
                out[pos] = str(self[pos])  # TODO: check NA

            return out
Exemplo n.º 9
0
        def gatherv_str_arr_impl(data):
            """Gather the local string arrays of all ranks into one array on MPI_ROOT.

            Each rank sends the lengths of its strings and its raw character
            buffer through two `c_gatherv` calls. On MPI_ROOT the received
            lengths are converted to offsets in place and the full array is
            returned; other ranks return a one-element dummy `StringArray`.
            """
            rank = sdc.distributed_api.get_rank()
            n_loc = len(data)
            n_all_chars = num_total_chars(data)

            # allocate send lens arrays
            send_arr_lens = np.empty(n_loc, np.uint32)  # XXX offset type is uint32
            send_data_ptr = get_data_ptr(data)

            # NOTE(review): len(_str) is a character count, while the data
            # buffer is sent with n_all_chars elements - confirm the two agree
            # for non-ASCII strings
            for i in range(n_loc):
                _str = data[i]
                send_arr_lens[i] = len(_str)

            # per-rank element counts and per-rank character counts
            recv_counts = gather_scalar(np.int32(n_loc))
            recv_counts_char = gather_scalar(np.int32(n_all_chars))
            n_total = recv_counts.sum()
            n_total_char = recv_counts_char.sum()

            # displacements
            all_data = StringArray([''])  # dummy arrays on non-root PEs
            displs = np.empty(0, np.int32)
            displs_char = np.empty(0, np.int32)

            # only the root allocates the real receive buffer and displacements
            if rank == MPI_ROOT:
                all_data = pre_alloc_string_array(n_total, n_total_char)
                displs = sdc.hiframes.join.calc_disp(recv_counts)
                displs_char = sdc.hiframes.join.calc_disp(recv_counts_char)

            offset_ptr = get_offset_ptr(all_data)
            data_ptr = get_data_ptr(all_data)
            # first gather: string lengths, received into the offsets buffer
            c_gatherv(
                send_arr_lens.ctypes,
                np.int32(n_loc),
                offset_ptr,
                recv_counts.ctypes,
                displs.ctypes,
                int32_typ_enum)
            # second gather: the raw character data
            c_gatherv(
                send_data_ptr,
                np.int32(n_all_chars),
                data_ptr,
                recv_counts_char.ctypes,
                displs_char.ctypes,
                char_typ_enum)
            # the offsets buffer currently holds lengths; convert it in place
            convert_len_arr_to_offset(offset_ptr, n_total)
            return all_data
Exemplo n.º 10
0
def ensure_capacity_str(arr, new_size, n_chars):
    """Return a string array able to hold `new_size` items plus `n_chars` more characters.

    `new_size` is the position right after the write index, i.e. the element
    about to be written is `new_size - 1`, and `n_chars` is its size. If `arr`
    already has room it is returned unchanged; otherwise a larger array is
    allocated and the first `new_size - 1` elements are copied into it.
    """
    # new_size is right after write index
    new_arr = arr
    curr_len = len(arr)
    curr_num_chars = num_total_chars(arr)
    needed_total_chars = getitem_str_offset(arr, new_size - 1) + n_chars

    # TODO: corner case test
    if curr_len < new_size or needed_total_chars > curr_num_chars:
        new_len = curr_len
        if curr_len < new_size:
            # doubling alone is not enough when the array is empty (2 * 0 == 0)
            # or when new_size is more than twice the current length
            new_len = int(max(2 * curr_len, new_size))
        new_num_chars = int(
            (2 * curr_num_chars + n_chars)
            if needed_total_chars > curr_num_chars else curr_num_chars)
        new_arr = pre_alloc_string_array(new_len, new_num_chars)
        # carry over everything written so far (elements before the write index)
        copy_str_arr_slice(new_arr, arr, new_size - 1)

    return new_arr
Exemplo n.º 11
0
            def sdc_fillna_str_impl(self, inplace=False, value=None):
                """Return a copy of the string data `self` with NA entries replaced by `value`.

                The `inplace` argument is not used by the body; a new string
                array is always returned.
                """
                n = len(self)
                num_chars = 0
                # get total size of the new array; the buffer is sized in UTF-8
                # bytes, so len() would under-allocate for non-ASCII strings
                for i in prange(n):
                    s = self[i]
                    if sdc.hiframes.api.isna(self, i):
                        num_chars += get_utf8_size(value)
                    else:
                        num_chars += get_utf8_size(s)

                filled_data = pre_alloc_string_array(n, num_chars)
                # StringArray doesn't support parallel setitem, thus no prange here
                for i in range(n):
                    if sdc.hiframes.api.isna(self, i):
                        filled_data[i] = value
                    else:
                        filled_data[i] = self[i]
                return filled_data
Exemplo n.º 12
0
            def sdc_fillna_str_impl(self, inplace=False, value=None):
                """Return a copy of the string data `self` with NA entries replaced by `value`.

                The `inplace` argument is not used by the body; a new string
                array is always returned.
                """
                size = len(self)

                # first pass: UTF-8 size of the filled array
                total_bytes = 0
                for idx in prange(size):
                    item = self[idx]
                    if not sdc.hiframes.api.isna(self, idx):
                        total_bytes += get_utf8_size(item)
                    else:
                        total_bytes += get_utf8_size(value)

                result = pre_alloc_string_array(size, total_bytes)

                # StringArray doesn't support parallel setitem, thus no prange here
                for idx in numpy.arange(size):
                    if not sdc.hiframes.api.isna(self, idx):
                        result[idx] = self[idx]
                    else:
                        result[idx] = value
                return result
Exemplo n.º 13
0
 def empty_like_type_str_arr(n, arr):
     """Pre-allocate a string array of `n` items sized after `arr`.

     The character budget is `n` times the average string size of `arr`
     (integer division), or `n * 20` when `arr` is empty.
     """
     n_items = len(arr)
     if n_items == 0:
         chars_per_item = 20  # heuristic
     else:
         # average character heuristic
         chars_per_item = num_total_chars(arr) // n_items
     return pre_alloc_string_array(n, n * chars_per_item)
Exemplo n.º 14
0
 def trim_arr_str(arr, size):
     """Return a new string array holding the first `size` elements of `arr`.

     The character buffer of the new array is sized from the offset stored at
     position `size` in `arr`.
     """
     n_chars = np.int64(getitem_str_offset(arr, size))
     trimmed = pre_alloc_string_array(size, n_chars)
     copy_str_arr_slice(trimmed, arr, size)
     return trimmed
Exemplo n.º 15
0
        def sdc_join_series_indexes_impl(left, right):
            """Join two string index arrays by merging them in sorted order.

            Returns `(joined, lidx, ridx)`: `joined` is a string array with the
            merged index values, while `lidx` / `ridx` are int64 arrays giving,
            for positions `0 .. len(joined) - 1`, the source position in `left`
            / `right`, or -1 when that side has no matching value. Runs of equal
            values produce every left/right pairing. `lidx` and `ridx` are
            capacity buffers and may be longer than `joined`.
            """

            # allocate result arrays
            lsize = len(left)
            rsize = len(right)
            est_total_size = int(1.1 * (lsize + rsize))

            lidx = numpy.empty(est_total_size, numpy.int64)
            ridx = numpy.empty(est_total_size, numpy.int64)

            # use Series.sort_values since argsort for StringArrays not implemented
            original_left_series = pandas.Series(left)
            original_right_series = pandas.Series(right)

            # sort arrays saving the old positions
            left_series = original_left_series.sort_values(kind='mergesort')
            right_series = original_right_series.sort_values(kind='mergesort')
            sorted_left = left_series._index
            sorted_right = right_series._index

            # i / j walk the sorted left / right positions, k is the write cursor
            i, j, k = 0, 0, 0
            while (i < lsize and j < rsize):
                lidx = _hpat_ensure_array_capacity(k + 1, lidx)
                ridx = _hpat_ensure_array_capacity(k + 1, ridx)

                left_index = left[sorted_left[i]]
                right_index = right[sorted_right[j]]

                if (left_index < right_index):
                    lidx[k] = sorted_left[i]
                    ridx[k] = -1
                    i += 1
                    k += 1
                elif (left_index > right_index):
                    lidx[k] = -1
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1
                else:
                    # find ends of sequences of equal index values in left and right
                    ni, nj = i, j
                    while (ni < lsize and left[sorted_left[ni]] == left_index):
                        ni += 1
                    while (nj < rsize
                           and right[sorted_right[nj]] == right_index):
                        nj += 1

                    # the matching right positions are the same for every left
                    # element of the block, so they are built only once
                    block_size = nj - j
                    to_ridx = numpy.array(
                        [sorted_right[r] for r in numpy.arange(j, nj, 1)],
                        numpy.int64)

                    # join the blocks found into results
                    for s in numpy.arange(i, ni, 1):
                        to_lidx = numpy.repeat(sorted_left[s], block_size)

                        lidx = _hpat_ensure_array_capacity(
                            k + block_size, lidx)
                        ridx = _hpat_ensure_array_capacity(
                            k + block_size, ridx)

                        lidx[k:k + block_size] = to_lidx
                        ridx[k:k + block_size] = to_ridx
                        k += block_size
                    i = ni
                    j = nj

            # fill the end of joined with remaining part of left or right
            if i < lsize:
                block_size = lsize - i
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                ridx[k:k + block_size] = numpy.repeat(-1, block_size)
                while i < lsize:
                    lidx[k] = sorted_left[i]
                    i += 1
                    k += 1

            elif j < rsize:
                block_size = rsize - j
                lidx = _hpat_ensure_array_capacity(k + block_size, lidx)
                ridx = _hpat_ensure_array_capacity(k + block_size, ridx)
                lidx[k:k + block_size] = numpy.repeat(-1, block_size)
                while j < rsize:
                    ridx[k] = sorted_right[j]
                    j += 1
                    k += 1

            # count total size of the joined values and allocate joined array;
            # the buffer is sized in UTF-8 bytes, so len() would under-allocate
            # for non-ASCII strings
            total_joined_size = k
            num_chars_in_joined = 0
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    num_chars_in_joined += get_utf8_size(left[lidx[i]])
                elif ridx[i] != -1:
                    num_chars_in_joined += get_utf8_size(right[ridx[i]])

            joined = pre_alloc_string_array(total_joined_size,
                                            num_chars_in_joined)

            # iterate over joined and fill it with indexes using lidx and ridx indexers
            for i in numpy.arange(total_joined_size):
                if lidx[i] != -1:
                    joined[i] = left[lidx[i]]
                    if (str_arr_is_na(left, lidx[i])):
                        str_arr_set_na(joined, i)
                elif ridx[i] != -1:
                    joined[i] = right[ridx[i]]
                    if (str_arr_is_na(right, ridx[i])):
                        str_arr_set_na(joined, i)
                else:
                    str_arr_set_na(joined, i)

            return joined, lidx, ridx