예제 #1
0
파일: join.py 프로젝트: stjordanis/hpat
def parallel_join_impl(key_arrs, data):
    """Redistribute local rows across ranks according to the hash of their key.

    Each row's destination rank is ``hash(key) % n_pes``.  The rows are
    scanned twice: once to accumulate per-rank shuffle metadata, and once to
    fill the send buffers, after which everything is exchanged with
    ``alltoallv_tup``.

    Args:
        key_arrs: tuple of key arrays; all are indexed by the same row number.
        data: tuple of data arrays travelling with the keys.

    Returns:
        ``(out_keys, out_data)`` as extracted from the received buffers by
        ``_get_keys_tup`` and ``_get_data_tup``.
    """
    n_pes = hpat.distributed_api.get_size()
    n_rows = len(key_arrs[0])

    # pass 1: accumulate send/recv counts per destination rank
    pre_shuffle_meta = alloc_pre_shuffle_metadata(key_arrs, data, n_pes, False)
    for row in range(n_rows):
        key = getitem_arr_tup_single(key_arrs, row)
        dest = hash(key) % n_pes
        key_tup = val_to_tup(key)
        row_data = getitem_arr_tup(data, row)
        update_shuffle_meta(pre_shuffle_meta, dest, row, key_tup, row_data,
                            False)

    shuffle_meta = finalize_shuffle_meta(
        key_arrs, data, pre_shuffle_meta, n_pes, False)

    # pass 2: write every row into the send buffer of its destination rank
    for row in range(n_rows):
        key = getitem_arr_tup_single(key_arrs, row)
        dest = hash(key) % n_pes
        write_send_buff(shuffle_meta, dest, row, val_to_tup(key), data)
        # bump the offset last since it is reused while writing the data
        shuffle_meta.tmp_offset[dest] += 1

    # exchange keys and data together, then split them apart again
    recvs = alltoallv_tup(key_arrs + data, shuffle_meta)
    out_keys = _get_keys_tup(recvs, key_arrs)
    out_data = _get_data_tup(recvs, key_arrs)
    return out_keys, out_data
예제 #2
0
파일: sort.py 프로젝트: stjordanis/hpat
def parallel_sort(key_arrs, data, ascending=True):
    """Range-partition the local rows across ranks using sampled key bounds.

    The first key array is sampled on every rank, the samples are gathered,
    ``n_pes - 1`` split points are chosen on the root rank and broadcast, and
    every local row is then assigned to a destination rank and exchanged with
    ``alltoallv_tup``.

    NOTE(review): the destination scan only ever moves forward through the
    bounds, so it assumes ``key_arrs[0]`` is already sorted locally in the
    requested order -- confirm against the caller.

    Args:
        key_arrs: tuple of key arrays; only ``key_arrs[0]`` is used for
            sampling and for choosing the destination rank.
        data: tuple of data arrays that travel with the keys.
        ascending: if True a row moves to the next rank once its key is
            ``>=`` the current bound, otherwise once it is ``<=`` the bound.

    Returns:
        ``(out_key, out_data)`` as extracted from the received buffers by
        ``_get_keys_tup`` and ``_get_data_tup``.
    """
    n_local = len(key_arrs[0])
    n_total = hpat.distributed_api.dist_reduce(n_local,
                                               np.int32(Reduce_Type.Sum.value))

    n_pes = hpat.distributed_api.get_size()
    my_rank = hpat.distributed_api.get_rank()

    # similar to Spark's sample computation Partitioner.scala
    sampleSize = min(samplePointsPerPartitionHint * n_pes, MIN_SAMPLES)

    # max(n_total, 1) avoids dividing by zero when no rank holds any rows
    fraction = min(sampleSize / max(n_total, 1), 1.0)
    n_loc_samples = min(math.ceil(fraction * n_local), n_local)
    inds = np.random.randint(0, n_local, n_loc_samples)
    samples = key_arrs[0][inds]

    all_samples = hpat.distributed_api.gatherv(samples)
    all_samples = to_string_list(all_samples)
    bounds = empty_like_type(n_pes - 1, all_samples)

    # only the root rank picks the split points; the others receive them
    # through the broadcast below
    if my_rank == MPI_ROOT:
        all_samples.sort()
        if not ascending:
            all_samples = all_samples[::-1]
        n_samples = len(all_samples)
        step = math.ceil(n_samples / n_pes)
        for i in range(n_pes - 1):
            # clamp so the last bounds stay in range when samples are scarce
            bounds[i] = all_samples[min((i + 1) * step, n_samples - 1)]

    bounds = str_list_to_array(bounds)
    bounds = hpat.distributed_api.prealloc_str_for_bcast(bounds)
    hpat.distributed_api.bcast(bounds)

    # calc send/recv counts
    pre_shuffle_meta = alloc_pre_shuffle_metadata(key_arrs, data, n_pes, True)
    node_id = 0
    for i in range(n_local):
        val = key_arrs[0][i]
        # Advance past every bound this key has crossed.  A single `if` would
        # move at most one rank per row, so a key that jumps over several
        # bounds at once (e.g. a rank whose smallest key already lies beyond
        # bounds[1]) would be sent to the wrong rank and break the global
        # order.  The `node_id < n_pes - 1` test short-circuits before
        # bounds[node_id] could be read out of range.
        while node_id < (n_pes - 1) and (
                (ascending and val >= bounds[node_id])
                or ((not ascending) and val <= bounds[node_id])):
            node_id += 1
        update_shuffle_meta(pre_shuffle_meta, node_id, i, (val, ),
                            getitem_arr_tup(data, i), True)

    shuffle_meta = finalize_shuffle_meta(key_arrs, data, pre_shuffle_meta,
                                         n_pes, True)

    # shuffle
    recvs = alltoallv_tup(key_arrs + data, shuffle_meta)
    out_key = _get_keys_tup(recvs, key_arrs)
    out_data = _get_data_tup(recvs, key_arrs)

    return out_key, out_data
예제 #3
0
def local_merge_asof(left_key, right_key, data_left, data_right):
    """Backward as-of merge of two key arrays and their data tuples.

    For every left row, the right row used is the last one reached by a
    forward scan whose key is ``<=`` the left key.  Left rows with no such
    right row get NaN in the right key and right data outputs.  The scan
    position on the right is carried over between left rows, so both key
    arrays are presumably expected to be sorted -- confirm against callers.

    Adapted from pandas/_libs/join_func_helper.pxi.

    Returns:
        ``(out_left_key, out_right_key, out_data_left, out_data_right)``,
        each with one entry per left row.
    """
    n_left = len(left_key)
    n_right = len(right_key)

    # the output always has exactly one row per left row
    out_left_key = empty_like_type(n_left, left_key)
    out_right_key = empty_like_type(n_left, right_key)
    out_data_left = alloc_arr_tup(n_left, data_left)
    out_data_right = alloc_arr_tup(n_left, data_right)

    r_pos = 0
    for l_pos in range(n_left):
        l_val = left_key[l_pos]

        # the previous row may have left the cursor at -1 (no match)
        if r_pos < 0:
            r_pos = 0

        # step just past the last right key that is <= the left key ...
        while r_pos < n_right and right_key[r_pos] <= l_val:
            r_pos += 1
        # ... then back up onto it (or to -1 when there is none)
        r_pos -= 1

        # the left side is copied through unchanged
        out_left_key[l_pos] = l_val
        # TODO: copy_tup
        setitem_arr_tup(out_data_left, l_pos,
                        getitem_arr_tup(data_left, l_pos))

        if r_pos < 0:
            setitem_arr_nan(out_right_key, l_pos)
            setitem_arr_tup_nan(out_data_right, l_pos)
        else:
            out_right_key[l_pos] = right_key[r_pos]
            setitem_arr_tup(out_data_right, l_pos,
                            getitem_arr_tup(data_right, r_pos))

    return out_left_key, out_right_key, out_data_left, out_data_right
예제 #4
0
def local_merge_new(left_key, right_key, data_left, data_right):
    """Inner merge of two single-key inputs by a two-cursor scan.

    Both cursors advance over the key arrays; the smaller side moves forward
    until the keys are equal.  The cursor logic only makes sense for sorted
    keys -- presumably guaranteed by the caller, confirm.  Output buffers
    start at an estimated capacity and are reassigned from the
    ``copy_elem_buff*`` helpers on every write.

    Returns:
        ``(out_left_key, out_right_key, out_data_left, out_data_right)``
        trimmed to the number of emitted rows; ``out_right_key`` is a copy of
        ``out_left_key``.
    """
    n_left = len(left_key)
    n_right = len(right_key)

    # initial capacity guess for the output buffers
    capacity = 101 + min(n_left, n_right) // 10
    out_left_key = empty_like_type(capacity, left_key)
    out_data_left = alloc_arr_tup(capacity, data_left)
    out_data_right = alloc_arr_tup(capacity, data_right)

    n_out = 0
    l_pos = 0
    r_pos = 0

    while l_pos < n_left and r_pos < n_right:
        if left_key[l_pos] == right_key[r_pos]:
            # emit the (l_pos, r_pos) pair itself
            out_left_key = copy_elem_buff(out_left_key, n_out, left_key[l_pos])
            l_row = getitem_arr_tup(data_left, l_pos)
            out_data_left = copy_elem_buff_tup(out_data_left, n_out, l_row)
            r_row = getitem_arr_tup(data_right, r_pos)
            out_data_right = copy_elem_buff_tup(out_data_right, n_out, r_row)
            n_out += 1

            # pair the current right row with every later equal left row
            l_run = l_pos + 1
            while l_run < n_left and left_key[l_run] == right_key[r_pos]:
                out_left_key = copy_elem_buff(
                    out_left_key, n_out, left_key[l_run])
                l_row = getitem_arr_tup(data_left, l_run)
                out_data_left = copy_elem_buff_tup(out_data_left, n_out, l_row)
                r_row = getitem_arr_tup(data_right, r_pos)
                out_data_right = copy_elem_buff_tup(
                    out_data_right, n_out, r_row)
                n_out += 1
                l_run += 1

            # pair the current left row with every later equal right row
            r_run = r_pos + 1
            while r_run < n_right and right_key[r_run] == left_key[l_pos]:
                out_left_key = copy_elem_buff(
                    out_left_key, n_out, left_key[l_pos])
                l_row = getitem_arr_tup(data_left, l_pos)
                out_data_left = copy_elem_buff_tup(out_data_left, n_out, l_row)
                r_row = getitem_arr_tup(data_right, r_run)
                out_data_right = copy_elem_buff_tup(
                    out_data_right, n_out, r_row)
                n_out += 1
                r_run += 1

            # both cursors advance by one; remaining duplicate pairs are
            # produced by the following iterations
            l_pos += 1
            r_pos += 1
        elif left_key[l_pos] < right_key[r_pos]:
            l_pos += 1
        else:
            r_pos += 1

    # cut the buffers down to the rows actually written
    out_left_key = trim_arr(out_left_key, n_out)
    out_right_key = out_left_key.copy()
    out_data_left = trim_arr_tup(out_data_left, n_out)
    out_data_right = trim_arr_tup(out_data_right, n_out)

    return out_left_key, out_right_key, out_data_left, out_data_right
예제 #5
0
파일: join.py 프로젝트: stjordanis/hpat
def local_merge_new(left_keys, right_keys, data_left, data_right,
                    is_left=False, is_outer=False):
    """Merge two multi-key inputs by a two-cursor scan, optionally keeping
    unmatched rows.

    Equal keys produce the cartesian product of the matching runs on both
    sides.  With ``is_left`` unmatched left rows are emitted with the right
    data set to NaN; with ``is_outer`` unmatched right rows are emitted with
    the left data set to NaN and the right key stored in the key output.  The
    cursor logic only makes sense for sorted keys -- presumably guaranteed by
    the caller, confirm.

    Returns:
        ``(out_left_key, out_right_key, out_data_left, out_data_right)``
        trimmed to the number of emitted rows; ``out_right_key`` is a copy of
        ``out_left_key``.
    """
    n_left = len(left_keys[0])
    n_right = len(right_keys[0])

    # TODO: approximate output size properly
    if is_left and is_outer:
        capacity = int(1.1 * (n_left + n_right))
    elif is_outer:
        capacity = int(1.1 * n_right)
    elif is_left:
        capacity = int(1.1 * n_left)
    else:
        capacity = 101 + min(n_left, n_right) // 2

    out_left_key = alloc_arr_tup(capacity, left_keys)
    out_data_left = alloc_arr_tup(capacity, data_left)
    out_data_right = alloc_arr_tup(capacity, data_right)

    n_out = 0
    l_pos = 0
    r_pos = 0

    while l_pos < n_left and r_pos < n_right:
        l_key = getitem_arr_tup(left_keys, l_pos)
        r_key = getitem_arr_tup(right_keys, r_pos)

        if l_key == r_key:
            # cartesian product in case of duplicate keys on either side
            l_run = l_pos
            while (l_run < n_left
                   and getitem_arr_tup(left_keys, l_run) == l_key):
                r_run = r_pos
                while (r_run < n_right
                       and getitem_arr_tup(right_keys, r_run) == l_key):
                    out_left_key = copy_elem_buff_tup(
                        out_left_key, n_out, l_key)
                    l_row = getitem_arr_tup(data_left, l_run)
                    out_data_left = copy_elem_buff_tup(
                        out_data_left, n_out, l_row)
                    r_row = getitem_arr_tup(data_right, r_run)
                    out_data_right = copy_elem_buff_tup(
                        out_data_right, n_out, r_row)
                    n_out += 1
                    r_run += 1
                l_run += 1
            # skip both runs entirely
            l_pos = l_run
            r_pos = r_run
        elif l_key < r_key:
            # left row has no partner on the right
            if is_left:
                out_left_key = copy_elem_buff_tup(out_left_key, n_out, l_key)
                l_row = getitem_arr_tup(data_left, l_pos)
                out_data_left = copy_elem_buff_tup(out_data_left, n_out, l_row)
                out_data_right = setnan_elem_buff_tup(out_data_right, n_out)
                n_out += 1
            l_pos += 1
        else:
            # right row has no partner on the left
            if is_outer:
                # TODO: support separate keys?
                out_left_key = copy_elem_buff_tup(out_left_key, n_out, r_key)
                out_data_left = setnan_elem_buff_tup(out_data_left, n_out)
                r_row = getitem_arr_tup(data_right, r_pos)
                out_data_right = copy_elem_buff_tup(
                    out_data_right, n_out, r_row)
                n_out += 1
            r_pos += 1

    # leftover left rows after the right side ran out
    if is_left:
        while l_pos < n_left:
            out_left_key = copy_elem_buff_tup(
                out_left_key, n_out, getitem_arr_tup(left_keys, l_pos))
            l_row = getitem_arr_tup(data_left, l_pos)
            out_data_left = copy_elem_buff_tup(out_data_left, n_out, l_row)
            out_data_right = setnan_elem_buff_tup(out_data_right, n_out)
            n_out += 1
            l_pos += 1

    # leftover right rows after the left side ran out
    if is_outer:
        while r_pos < n_right:
            out_left_key = copy_elem_buff_tup(
                out_left_key, n_out, getitem_arr_tup(right_keys, r_pos))
            out_data_left = setnan_elem_buff_tup(out_data_left, n_out)
            r_row = getitem_arr_tup(data_right, r_pos)
            out_data_right = copy_elem_buff_tup(out_data_right, n_out, r_row)
            n_out += 1
            r_pos += 1

    # cut the buffers down to the rows actually written
    out_left_key = trim_arr_tup(out_left_key, n_out)
    out_right_key = copy_arr_tup(out_left_key)
    out_data_left = trim_arr_tup(out_data_left, n_out)
    out_data_right = trim_arr_tup(out_data_right, n_out)

    return out_left_key, out_right_key, out_data_left, out_data_right
예제 #6
0
파일: join.py 프로젝트: stjordanis/hpat
 def _impl(right_keys, r_ind, l_key):
     """Return ``r_ind`` if the right key at that row equals ``l_key``, else -1."""
     candidate = getitem_arr_tup(right_keys, r_ind)
     # -1 tells the caller that this candidate row does not actually match
     return -1 if candidate != l_key else r_ind
예제 #7
0
파일: join.py 프로젝트: stjordanis/hpat
def local_hash_join_impl(left_keys, right_keys, data_left, data_right,
                         is_left=False, is_right=False):
    """Join two keyed inputs by indexing the right side in a multimap.

    Every right row index is inserted into an int64 multimap under
    ``_hash_if_tup(key)``.  Each left row then looks up its candidates, and
    ``_check_ind_if_hashed`` filters out candidates it reports as ``-1``.
    With ``is_left`` unmatched left rows are emitted with the right data set
    to NaN; with ``is_right`` unmatched right rows are emitted afterwards with
    the left data set to NaN and the right key stored in the key output.

    Returns:
        ``(out_left_key, out_right_key, out_data_left, out_data_right)``
        trimmed to the number of emitted rows; ``out_right_key`` is a copy of
        ``out_left_key``.
    """
    l_len = len(left_keys[0])
    r_len = len(right_keys[0])

    # TODO: approximate output size properly
    if is_left and is_right:
        capacity = int(1.1 * (l_len + r_len))
    elif is_right:
        capacity = int(1.1 * r_len)
    elif is_left:
        capacity = int(1.1 * l_len)
    else:
        capacity = 101 + min(l_len, r_len) // 2

    out_left_key = alloc_arr_tup(capacity, left_keys)
    out_data_left = alloc_arr_tup(capacity, data_left)
    out_data_right = alloc_arr_tup(capacity, data_right)
    # keep track of matched keys in case of right join
    if is_right:
        r_matched = np.full(r_len, False, np.bool_)

    n_out = 0

    # build phase: index every right row by its (possibly hashed) key
    right_index = hpat.dict_ext.multimap_int64_init()
    for r_row in range(r_len):
        # store hash if keys are tuple or non-int
        r_hash = _hash_if_tup(getitem_arr_tup(right_keys, r_row))
        hpat.dict_ext.multimap_int64_insert(right_index, r_hash, r_row)

    # probe phase: look up every left row; the range object is reused
    candidates = hpat.dict_ext.multimap_int64_equal_range_alloc()
    for l_row in range(l_len):
        l_key = getitem_arr_tup(left_keys, l_row)
        l_data_val = getitem_arr_tup(data_left, l_row)
        l_hash = _hash_if_tup(l_key)
        hpat.dict_ext.multimap_int64_equal_range_inplace(
            right_index, l_hash, candidates)

        found_match = False
        for cand in candidates:
            # if a hash was stored, check the left key against the actual
            # right key
            r_ind = _check_ind_if_hashed(right_keys, cand, l_key)
            if r_ind == -1:
                continue
            if is_right:
                r_matched[r_ind] = True
            r_data_val = getitem_arr_tup(data_right, r_ind)
            out_left_key = copy_elem_buff_tup(out_left_key, n_out, l_key)
            out_data_left = copy_elem_buff_tup(
                out_data_left, n_out, l_data_val)
            out_data_right = copy_elem_buff_tup(
                out_data_right, n_out, r_data_val)
            n_out += 1
            found_match = True

        # left join: keep the unmatched left row with NaN right data
        if is_left and not found_match:
            out_left_key = copy_elem_buff_tup(out_left_key, n_out, l_key)
            out_data_left = copy_elem_buff_tup(
                out_data_left, n_out, l_data_val)
            out_data_right = setnan_elem_buff_tup(out_data_right, n_out)
            n_out += 1

    hpat.dict_ext.multimap_int64_equal_range_dealloc(candidates)

    # produce NA rows for unmatched right keys
    if is_right:
        for r_row in range(r_len):
            if not r_matched[r_row]:
                r_key = getitem_arr_tup(right_keys, r_row)
                r_data_val = getitem_arr_tup(data_right, r_row)
                out_left_key = copy_elem_buff_tup(out_left_key, n_out, r_key)
                out_data_right = copy_elem_buff_tup(
                    out_data_right, n_out, r_data_val)
                out_data_left = setnan_elem_buff_tup(out_data_left, n_out)
                n_out += 1

    # cut the buffers down to the rows actually written
    out_left_key = trim_arr_tup(out_left_key, n_out)
    out_right_key = copy_arr_tup(out_left_key)
    out_data_left = trim_arr_tup(out_data_left, n_out)
    out_data_right = trim_arr_tup(out_data_right, n_out)

    return out_left_key, out_right_key, out_data_left, out_data_right
예제 #8
0
파일: shuffle_utils.py 프로젝트: rowhit/sdc
def getitem_arr_tup_single_overload(arrs, i):
    """Pick the implementation used to read row ``i`` from a tuple of arrays.

    When ``arrs.types`` has exactly one entry the returned function yields the
    bare element ``arrs[0][i]``; otherwise it defers to ``getitem_arr_tup``.
    """
    if len(arrs.types) != 1:
        def _impl_multi(arrs, i):
            return getitem_arr_tup(arrs, i)
        return _impl_multi

    def _impl_single(arrs, i):
        # single array: return the element itself rather than going
        # through getitem_arr_tup
        return arrs[0][i]
    return _impl_single