Example #1
0
def tests_two_column_merge_left(left_nkeys=4, right_nkeys=5):
    """Regression test for issue #57.

    Exercises a two-key left join that used to trigger an error
    under cuda-memcheck.
    """
    how = 'left'
    nrows_l = 60
    nrows_r = 60
    ctx = new_context()
    libgdf.gdf_context_view(ctx, 0, libgdf.GDF_HASH, 0)

    np.random.seed(0)

    # Two random key columns per side (PyGDF input).
    keys_left = [np.random.randint(0, left_nkeys, size=nrows_l)
                 for _ in range(2)]
    keys_right = [np.random.randint(0, right_nkeys, size=nrows_r)
                  for _ in range(2)]

    with _make_input_multi(keys_left, keys_right, 2) as (col_left, col_right):
        joined_idx = _call_join_multi(libgdf.gdf_left_join, 2, col_left,
                                      col_right, ctx)

    # The joined indices must be valid row positions; -1 marks an
    # unmatched right row in a left join.
    assert joined_idx.shape[0] == 2
    assert np.all(joined_idx[0] >= 0)
    assert np.all(joined_idx[1] >= -1)
    assert np.all(joined_idx[0] < nrows_l)
    assert np.all(joined_idx[1] < nrows_r)
Example #2
0
File: _gdf.py  Project: cuulee/cudf
def quantile(column, quant, method, exact):
    """Compute the quantiles listed in `quant` for `column`.

    Parameters
    ----------
    column : column object exposing a `cffi_view` of the device data.
    quant : sequence of float
        Quantile fractions to evaluate.
    method : str
        Interpolation method name, translated via `get_quantile_method`
        (only used by the exact kernel).
    exact : bool
        Use the exact libgdf kernel when True, the approximate one
        otherwise.

    Returns
    -------
    list of float — one value per entry in `quant`.
    """
    gdf_context = ffi.new('gdf_context*')
    # Quantiles are always computed with the sort-based method.
    method_api = _join_method_api['sort']
    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)
    res = []
    for q in quant:
        # Fresh output slot for each quantile value.
        px = ffi.new("double *")
        if exact:
            libgdf.gdf_quantile_exact(column.cffi_view,
                                      get_quantile_method(method),
                                      q,
                                      ffi.cast('void *', px),
                                      gdf_context)
        else:
            # NOTE: 'aprrox' is the (misspelled) symbol exported by libgdf.
            libgdf.gdf_quantile_aprrox(column.cffi_view,
                                       q,
                                       ffi.cast('void *', px),
                                       gdf_context)
        res.append(px[0])
    return res
Example #3
0
def apply_join(col_lhs, col_rhs, how, method='hash'):
    """Generator yielding the (left, right) joined indices as gpu arrays.

    Intended to be driven as a single-shot generator (e.g. wrapped in a
    context manager): the libgdf result columns are freed when the
    generator is resumed or closed — including on error in the consumer.

    Parameters
    ----------
    col_lhs, col_rhs : lists of columns exposing `cffi_view`; must have
        equal length.
    how : join kind key into `_join_how_api` (e.g. 'left', 'inner').
    method : 'hash' or 'sort'.

    Raises
    ------
    ValueError
        If the column lists differ in length or `method` is unknown.
    """
    if len(col_lhs) != len(col_rhs):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    # Second argument is the flag_sorted hint: 0 for hash, 1 for sort.
    if method == 'hash':
        libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    elif method == 'sort':
        libgdf.gdf_context_view(gdf_context, 1, method_api, 0)
    else:
        msg = "method not supported"
        raise ValueError(msg)

    col_result_l = columnview(0, None, dtype=np.int32)
    col_result_r = columnview(0, None, dtype=np.int32)

    if how in ['left', 'inner']:
        # Multi-column API: pass lists of column views.
        list_lhs = [col.cffi_view for col in col_lhs]
        list_rhs = [col.cffi_view for col in col_rhs]

        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, col_result_l, col_result_r,
               gdf_context)
    else:
        # Single-column API; only the first column of each side is joined.
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, col_result_l,
               col_result_r)

    try:
        # Wrap the raw device pointers as numba device arrays.
        left = _as_numba_devarray(
            intaddr=int(ffi.cast("uintptr_t", col_result_l.data)),
            nelem=col_result_l.size,
            dtype=np.int32)
        right = _as_numba_devarray(
            intaddr=int(ffi.cast("uintptr_t", col_result_r.data)),
            nelem=col_result_r.size,
            dtype=np.int32)

        yield (left, right)
    finally:
        # Free the result columns even if the consumer raises into the
        # generator or closes it early (the original leaked in that case).
        libgdf.gdf_column_free(col_result_l)
        libgdf.gdf_column_free(col_result_r)
Example #4
0
def test_leftjoin(dtype, join_type):
    """Single-column left join: check joined positions and keys."""
    # Input key columns.
    left = np.array([[0, 0, 4, 5, 5]], dtype=dtype)
    right = np.array([[0, 0, 2, 3, 5]], dtype=dtype)

    gdf_ctxt = new_context()
    libgdf.gdf_context_view(gdf_ctxt, (join_type == libgdf.GDF_SORT),
                            join_type, 0)
    with _make_input_multi(left, right, 1) as (col_left, col_right):
        joined_idx = _call_join_multi(libgdf.gdf_left_join, 1, col_left,
                                      col_right, gdf_ctxt)

    # Expected answer, generated with pandas:
    #   df = pd.DataFrame(); df['a'] = list(range(5))
    #   df1 = df.set_index(np.array([0, 0, 4, 5, 5]))
    #   df2 = df.set_index(np.array([0, 0, 2, 3, 5]))
    #   df1.join(df2, lsuffix='_left', rsuffix='_right', how='left')
    #      a_left  a_right
    #   0       0      0.0
    #   0       0      1.0
    #   0       1      0.0
    #   0       1      1.0
    #   4       2      NaN
    #   5       3      4.0
    #   5       4      4.0
    left_pos, right_pos = joined_idx
    left_idx = [left[0][i] for i in left_pos]
    # -1 marks a right row with no match; map it to None like pandas NaN.
    right_idx = [right[0][j] if j != -1 else None for j in right_pos]
    print(left_idx)
    print(right_idx)

    # Hash join output order is unspecified, so sort before comparing.
    left_idx = sorted(left_idx)
    assert tuple(left_idx) == (0, 0, 0, 0, 4, 5, 5)
    # Sorting wouldn't work for the None (NaN) entries:
    # assert tuple(right_idx) == (0, 0, 0, 0, None, 5, 5)

    # Sort the (left, right) position pairs for the same reason.
    pairs = sorted(zip(left_pos, right_pos), key=lambda p: (p[0], p[1]))
    left_pos = [lp for lp, _ in pairs]
    right_pos = [rp for _, rp in pairs]
    # left_pos corresponds to a_left above ...
    assert tuple(left_pos) == (0, 0, 1, 1, 2, 3, 4)
    # ... and right_pos to a_right.
    assert tuple(right_pos) == (0, 1, 0, 1, -1, 4, 4)
Example #5
0
def test_innerjoin(dtype, join_type):
    """Single-column inner join: check joined positions and keys."""
    # Input key columns.
    left = np.array([[0, 0, 1, 2, 3]], dtype=dtype)
    right = np.array([[0, 1, 2, 2, 3]], dtype=dtype)

    gdf_ctxt = new_context()
    libgdf.gdf_context_view(gdf_ctxt, (join_type == libgdf.GDF_SORT),
                            join_type, 0)
    with _make_input_multi(left, right, 1) as (col_left, col_right):
        joined_idx = _call_join_multi(libgdf.gdf_inner_join, 1, col_left,
                                      col_right, gdf_ctxt)
    print(joined_idx)

    # Expected answer, generated with pandas:
    #   df = pd.DataFrame(); df['a'] = list(range(5))
    #   df1 = df.set_index(np.array([0, 0, 1, 2, 3]))
    #   df2 = df.set_index(np.array([0, 1, 2, 2, 3]))
    #   df1.join(df2, lsuffix='_left', rsuffix='_right', how='inner')
    #      a_left  a_right
    #   0       0        0
    #   0       1        0
    #   1       2        1
    #   2       3        2
    #   2       3        3
    #   3       4        4
    left_pos, right_pos = joined_idx
    left_idx = left[0][left_pos]
    right_idx = right[0][right_pos]

    # Every matched pair of an inner join carries equal key values.
    assert list(left_idx) == list(right_idx)
    # Hash join output order is unspecified, so sort before comparing.
    pairs = sorted(zip(left_pos, right_pos), key=lambda p: (p[0], p[1]))
    left_pos = [lp for lp, _ in pairs]
    right_pos = [rp for _, rp in pairs]
    # left_pos corresponds to a_left above ...
    assert tuple(left_pos) == (0, 1, 2, 3, 3, 4)
    # ... and right_pos to a_right.
    assert tuple(right_pos) == (0, 0, 1, 2, 3, 4)
Example #6
0
def apply_join(col_lhs, col_rhs, how, method='hash'):
    """Generator yielding the (left, right) joined indices as gpu arrays.

    Intended to be driven as a single-shot generator (e.g. wrapped in a
    context manager): the libgdf join result is freed when the generator
    is resumed or closed — including on error in the consumer.

    Parameters
    ----------
    col_lhs, col_rhs : lists of columns exposing `cffi_view`; must have
        equal length.
    how : join kind key into `_join_how_api` (e.g. 'left', 'inner').
    method : 'hash' or 'sort'.

    Raises
    ------
    ValueError
        If the column lists differ in length or `method` is unknown.
    """
    if len(col_lhs) != len(col_rhs):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    join_result_ptr = ffi.new("gdf_join_result_type**", None)
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    # Second argument is the flag_sorted hint: 0 for hash, 1 for sort.
    if method == 'hash':
        libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    elif method == 'sort':
        libgdf.gdf_context_view(gdf_context, 1, method_api, 0)
    else:
        msg = "method not supported"
        raise ValueError(msg)

    if how in ['left', 'inner']:
        # Multi-column API: pass lists of column views.
        list_lhs = [col.cffi_view for col in col_lhs]
        list_rhs = [col.cffi_view for col in col_rhs]

        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, join_result_ptr, gdf_context)
    else:
        # Single-column API; only the first column of each side is joined.
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, join_result_ptr)

    # Extract result
    join_result = join_result_ptr[0]
    try:
        dataptr = libgdf.gdf_join_result_data(join_result)
        datasize = libgdf.gdf_join_result_size(join_result)
        ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                                 nelem=datasize, dtype=np.int32)
        # The flat buffer holds left indices then right indices.
        ary = ary.reshape(2, datasize // 2)
        yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
    finally:
        # Free the join result even if the consumer raises into the
        # generator or closes it early (the original leaked in that case).
        libgdf.gdf_join_result_free(join_result)
Example #7
0
def test_multileftjoin(dtype):
    """Left-join `left` and `right` on their first k+1 columns, for
    k in {0, 1, 2}, and check the joined row positions and key values
    against the pandas results reproduced in the comments below."""
    # Make data: three candidate key columns per side.
    left = np.array([[0, 0, 4, 5, 5], [1, 2, 2, 3, 4], [1, 1, 3, 1, 2]],
                    dtype=dtype)
    right = np.array([[0, 0, 2, 3, 5], [1, 2, 3, 3, 4], [3, 3, 2, 1, 1]],
                     dtype=dtype)
    gdf_ctxt = new_context()
    libgdf.gdf_context_view(gdf_ctxt, 0, libgdf.GDF_HASH, 0)

    for k in range(3):
        with _make_input_multi(left, right, k + 1) as (col_left, col_right):
            # Join on the first k+1 columns.
            joined_idx = _call_join_multi(libgdf.gdf_left_join, k + 1,
                                          col_left, col_right, gdf_ctxt)

        # Check answer
        # Can be generated by:
        # >>> df = pd.DataFrame()
        # >>> df2 = pd.DataFrame()
        # >>> df['a'] = [0, 0, 4, 5, 5]
        # >>> df2['a'] = [0, 0, 2, 3, 5]
        # >>> df['b'] = [1, 2, 2, 3,4]
        # >>> df2['b'] = [1, 2, 3, 3,4]
        # >>> df['c'] = [1, 1, 3, 1, 2]
        # >>> df2['c'] = [3, 3, 2, 1, 1]
        # k == 0 (join on 'a' only):
        # >>> joined = df.merge(df2, how='left', on=['a'], suffixes=['_remove', ''])
        # >>> joined
        # a  b_remove  c_remove    b    c
        # 0  0         1         1  1.0  3.0
        # 1  0         1         1  2.0  3.0
        # 2  0         2         1  1.0  3.0
        # 3  0         2         1  2.0  3.0
        # 4  4         2         3  NaN  NaN
        # 5  5         3         1  4.0  1.0
        # 6  5         4         2  4.0  1.0
        # k == 1 (join on 'a' and 'b'):
        # >>> joined = df.merge(df2, how='left', on=['a','b'], suffixes=['_remove', ''])
        # >>> joined
        # a  b  c_remove    c
        # 0  0  1         1  3.0
        # 1  0  2         1  3.0
        # 2  4  2         3  NaN
        # 3  5  3         1  NaN
        # 4  5  4         2  1.0
        # k == 2 (join on all three columns):
        # >>> joined = df.merge(df2, how='left', suffixes=['_remove', ''])
        # >>> joined
        # a  b  c
        # 0  0  1  1
        # 1  0  2  1
        # 2  4  2  3
        # 3  5  3  1
        # 4  5  4  2

        left_pos, right_pos = joined_idx

        # sort before checking since the hash join may produce results in random order
        tmp = sorted(zip(left_pos, right_pos),
                     key=lambda pair: (pair[0], pair[1]))
        left_pos = [x for x, _ in tmp]
        right_pos = [x for _, x in tmp]

        if (k == 0):
            # Single-key join: duplicate keys fan out; -1 marks no match.
            assert tuple(left_pos) == (0, 0, 1, 1, 2, 3, 4)
            assert tuple(right_pos) == (0, 1, 0, 1, -1, 4, 4)

            left_idx = [left[0][a] for a in left_pos]

            assert tuple(left_idx) == (0, 0, 0, 0, 4, 5, 5)

        elif (k == 1):
            # Two-key join: each left row appears exactly once.
            assert tuple(left_pos) == (0, 1, 2, 3, 4)

            for l in range(2):
                left_idx = [left[l][a] for a in left_pos]

                if (l == 0):
                    assert tuple(left_idx) == (0, 0, 4, 5, 5)
                elif (l == 1):
                    assert tuple(left_idx) == (1, 2, 2, 3, 4)

        elif (k == 2):
            # Three-key join: key columns come back in input order.
            assert tuple(left_pos) == (0, 1, 2, 3, 4)
            for l in range(3):
                left_idx = [left[l][a] for a in left_pos]

                if (l == 0):
                    assert tuple(left_idx) == (0, 0, 4, 5, 5)
                elif (l == 1):
                    assert tuple(left_idx) == (1, 2, 2, 3, 4)
                elif (l == 2):
                    assert tuple(left_idx) == (1, 1, 3, 1, 2)
Example #8
0
File: _gdf.py  Project: cuulee/cudf
def libgdf_join(col_lhs, col_rhs, on, how, method='sort'):
    """Join two ordered mappings of named columns on the `on` keys.

    Parameters
    ----------
    col_lhs, col_rhs : ordered mapping of name -> series-like objects
        exposing `_column` (with `cffi_view` and `dtype`).
    on : list of key column names present in both mappings.
    how : 'left', 'inner' or 'outer'.
    method : libgdf method key ('sort' or 'hash').

    Returns
    -------
    (res, valids) : two lists of device arrays — one data array and one
    validity-mask array per result column, ordered as
    [lhs non-key columns, key columns, rhs non-key columns].

    Raises
    ------
    ValueError
        If `how` is not one of the supported join kinds.
    """
    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    if how not in ['left', 'inner', 'outer']:
        # Message now matches the actual condition (the original claimed
        # only left/inner were supported while also accepting outer).
        msg = "new join api only supports left, inner or outer joins"
        raise ValueError(msg)

    list_lhs = []
    list_rhs = []
    result_cols = []

    result_col_names = []

    left_idx = []
    right_idx = []

    # Hoist the name lists so the per-key `.index` lookups below do not
    # rebuild them on every iteration.
    lhs_names = list(col_lhs.keys())
    rhs_names = list(col_rhs.keys())

    # Non-key lhs columns come first in the result.
    for name, col in col_lhs.items():
        list_lhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    # Then the key columns, recording their positional index on each side.
    for name in on:
        result_cols.append(columnview(0, None,
                                      dtype=col_lhs[name]._column.dtype))
        result_col_names.append(name)
        left_idx.append(lhs_names.index(name))
        right_idx.append(rhs_names.index(name))

    # Finally the non-key rhs columns.
    for name, col in col_rhs.items():
        list_rhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    num_cols_to_join = len(on)
    result_num_cols = len(list_lhs) + len(list_rhs) - num_cols_to_join

    joiner(list_lhs,
           len(list_lhs),
           left_idx,
           list_rhs,
           len(list_rhs),
           right_idx,
           num_cols_to_join,
           result_num_cols,
           result_cols,
           ffi.NULL,
           ffi.NULL,
           gdf_context)

    res = []
    valids = []

    # Wrap each result column's data and validity-mask device pointers;
    # the rmm finalizers take ownership of the allocations.
    for col in result_cols:
        intaddr = int(ffi.cast("uintptr_t", col.data))
        res.append(rmm.device_array_from_ptr(ptr=intaddr,
                                             nelem=col.size,
                                             dtype=gdf_to_np_dtype(col.dtype),
                                             finalizer=rmm._make_finalizer(
                                                 intaddr, 0)))
        intaddr = int(ffi.cast("uintptr_t", col.valid))
        valids.append(rmm.device_array_from_ptr(ptr=intaddr,
                                                nelem=calc_chunk_size(
                                                    col.size, mask_bitsize),
                                                dtype=mask_dtype,
                                                finalizer=rmm._make_finalizer(
                                                    intaddr, 0)))

    return res, valids