def tests_two_column_merge_left(left_nkeys=4, right_nkeys=5):
    """Test for issue #57.

    An issue that can trigger an error in cuda-memcheck.

    Performs a two-column left join over random key data and verifies
    only that every produced index is within range (the right indices of
    a left join may be -1 for rows with no match).
    """
    left_nrows = 60
    right_nrows = 60

    gdf_ctxt = new_context()
    # flag_sorted=0, hash-based join method.
    libgdf.gdf_context_view(gdf_ctxt, 0, libgdf.GDF_HASH, 0)

    np.random.seed(0)  # deterministic fixture data

    left_cols = [
        np.random.randint(0, left_nkeys, size=left_nrows),
        np.random.randint(0, left_nkeys, size=left_nrows),
    ]
    right_cols = [
        np.random.randint(0, right_nkeys, size=right_nrows),
        np.random.randint(0, right_nkeys, size=right_nrows),
    ]

    with _make_input_multi(left_cols, right_cols, 2) as (col_left, col_right):
        joined_idx = _call_join_multi(libgdf.gdf_left_join, 2, col_left,
                                      col_right, gdf_ctxt)
        # Just check that the indices in `joined_idx` are valid.
        assert joined_idx.shape[0] == 2
        assert np.all(0 <= joined_idx[0])
        # -1 marks "no match" on the right side of a left join.
        assert np.all(-1 <= joined_idx[1])
        assert np.all(joined_idx[0] < left_nrows)
        assert np.all(joined_idx[1] < right_nrows)
def quantile(column, quant, method, exact):
    """Calculate the `quant` quantiles for the column.

    Parameters
    ----------
    column : column object exposing ``cffi_view``
    quant : sequence of float
        Quantile positions to compute.
    method : interpolation method name, resolved via ``get_quantile_method``
    exact : bool
        If True call the exact quantile kernel, otherwise the approximate one.

    Returns
    -------
    list
        One value per entry in `quant`.
    """
    gdf_context = ffi.new('gdf_context*')
    # Quantile computation always uses the sort-based method.
    method_api = _join_method_api['sort']
    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    res = []
    for q in quant:
        # Fresh output slot for each quantile request.
        px = ffi.new("double *")
        if exact:
            libgdf.gdf_quantile_exact(column.cffi_view,
                                      get_quantile_method(method), q,
                                      ffi.cast('void *', px), gdf_context)
        else:
            # NOTE: "aprrox" is the spelling of the libgdf symbol itself;
            # do not "fix" it here.
            libgdf.gdf_quantile_aprrox(column.cffi_view, q,
                                       ffi.cast('void *', px), gdf_context)
        res.append(px[0])
    return res
def apply_join(col_lhs, col_rhs, how, method='hash'):
    """Returns a tuple of the left and right joined indices as gpu arrays.

    Generator that yields exactly one ``(left, right)`` pair of int32
    device arrays and frees the underlying libgdf result columns when
    resumed — presumably consumed via a context manager wrapper so the
    cleanup after ``yield`` runs; TODO confirm against callers.

    Parameters: `col_lhs`/`col_rhs` are equal-length lists of columns
    exposing ``cffi_view``; `how` selects the joiner from
    ``_join_how_api``; `method` is 'hash' or 'sort'.
    Raises ValueError on column-count mismatch or unknown method.
    """
    if (len(col_lhs) != len(col_rhs)):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    # The sort-based join requires the flag_sorted bit (second argument)
    # to be set; the hash join does not.
    if method == 'hash':
        libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    elif method == 'sort':
        libgdf.gdf_context_view(gdf_context, 1, method_api, 0)
    else:
        msg = "method not supported"
        raise ValueError(msg)

    # Output index columns, allocated by libgdf during the join call.
    col_result_l = columnview(0, None, dtype=np.int32)
    col_result_r = columnview(0, None, dtype=np.int32)

    if (how in ['left', 'inner']):
        # Multi-column join entry point: pass lists of column views.
        list_lhs = []
        list_rhs = []
        for i in range(len(col_lhs)):
            list_lhs.append(col_lhs[i].cffi_view)
            list_rhs.append(col_rhs[i].cffi_view)

        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, col_result_l,
               col_result_r, gdf_context)
    else:
        # Other join kinds take a single column pair and no context.
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, col_result_l,
               col_result_r)

    # Extract result: wrap the device pointers as numba device arrays
    # without copying.
    left = _as_numba_devarray(intaddr=int(
        ffi.cast("uintptr_t", col_result_l.data)),
        nelem=col_result_l.size, dtype=np.int32)
    right = _as_numba_devarray(intaddr=int(
        ffi.cast("uintptr_t", col_result_r.data)),
        nelem=col_result_r.size, dtype=np.int32)

    yield (left, right)

    # Free the libgdf-owned result columns once the caller is done;
    # `left`/`right` alias this memory and must not be used afterwards.
    libgdf.gdf_column_free(col_result_l)
    libgdf.gdf_column_free(col_result_r)
def test_leftjoin(dtype, join_type):
    """Single-key left join checked against a hand-computed pandas answer."""
    left = np.array([[0, 0, 4, 5, 5]], dtype=dtype)
    right = np.array([[0, 0, 2, 3, 5]], dtype=dtype)

    gdf_ctxt = new_context()
    # The sort join requires the flag_sorted bit; the hash join does not.
    libgdf.gdf_context_view(gdf_ctxt, (join_type == libgdf.GDF_SORT),
                            join_type, 0)

    with _make_input_multi(left, right, 1) as (col_left, col_right):
        joined_idx = _call_join_multi(libgdf.gdf_left_join, 1, col_left,
                                      col_right, gdf_ctxt)

        # Expected answer, generated with pandas:
        #   df1 = df.set_index(np.array([0, 0, 4, 5, 5]))
        #   df2 = df.set_index(np.array([0, 0, 2, 3, 5]))
        #   df1.join(df2, lsuffix='_left', rsuffix='_right', how='left')
        #      a_left  a_right
        #   0       0      0.0
        #   0       0      1.0
        #   0       1      0.0
        #   0       1      1.0
        #   4       2      NaN
        #   5       3      4.0
        #   5       4      4.0
        left_pos, right_pos = joined_idx
        left_idx = [left[0][i] for i in left_pos]
        right_idx = [None if j == -1 else right[0][j] for j in right_pos]
        print(left_idx)
        print(right_idx)

        # The hash join may emit rows in any order; sort before comparing.
        left_idx = sorted(left_idx)
        assert tuple(left_idx) == (0, 0, 0, 0, 4, 5, 5)
        # right_idx contains None entries (NaN rows), so it cannot be
        # sorted and compared the same way.

        pairs = sorted(zip(left_pos, right_pos))
        left_pos = [p[0] for p in pairs]
        right_pos = [p[1] for p in pairs]
        # left_pos corresponds to a_left, right_pos to a_right (-1 == NaN).
        assert tuple(left_pos) == (0, 0, 1, 1, 2, 3, 4)
        assert tuple(right_pos) == (0, 1, 0, 1, -1, 4, 4)
def test_innerjoin(dtype, join_type):
    """Single-key inner join checked against a hand-computed pandas answer."""
    left = np.array([[0, 0, 1, 2, 3]], dtype=dtype)
    right = np.array([[0, 1, 2, 2, 3]], dtype=dtype)

    gdf_ctxt = new_context()
    # The sort join requires the flag_sorted bit; the hash join does not.
    libgdf.gdf_context_view(gdf_ctxt, (join_type == libgdf.GDF_SORT),
                            join_type, 0)

    with _make_input_multi(left, right, 1) as (col_left, col_right):
        joined_idx = _call_join_multi(libgdf.gdf_inner_join, 1, col_left,
                                      col_right, gdf_ctxt)
        print(joined_idx)

        # Expected answer, generated with pandas:
        #   df1 = df.set_index(np.array([0, 0, 1, 2, 3]))
        #   df2 = df.set_index(np.array([0, 1, 2, 2, 3]))
        #   df1.join(df2, lsuffix='_left', rsuffix='_right', how='inner')
        #      a_left  a_right
        #   0       0        0
        #   0       1        0
        #   1       2        1
        #   2       3        2
        #   2       3        3
        #   3       4        4
        left_pos, right_pos = joined_idx

        # Every matched pair must carry equal key values on both sides.
        left_idx = left[0][left_pos]
        right_idx = right[0][right_pos]
        assert list(left_idx) == list(right_idx)

        # The hash join may emit rows in any order; sort before comparing.
        pairs = sorted(zip(left_pos, right_pos))
        left_pos = [p[0] for p in pairs]
        right_pos = [p[1] for p in pairs]
        # left_pos corresponds to a_left, right_pos to a_right.
        assert tuple(left_pos) == (0, 1, 2, 3, 3, 4)
        assert tuple(right_pos) == (0, 0, 1, 2, 3, 4)
def apply_join(col_lhs, col_rhs, how, method='hash'):
    """Returns a tuple of the left and right joined indices as gpu arrays.

    Generator that yields a single ``(left, right)`` pair sliced from one
    flat int32 device array of length ``2 * n`` owned by the libgdf join
    result, then frees that result when resumed — presumably consumed via
    a context manager wrapper; TODO confirm against callers.

    Parameters: `col_lhs`/`col_rhs` are equal-length lists of columns
    exposing ``cffi_view``; `how` selects the joiner from
    ``_join_how_api``; `method` is 'hash' or 'sort'.
    Raises ValueError on column-count mismatch or unknown method.
    """
    if(len(col_lhs) != len(col_rhs)):
        msg = "Unequal #columns in list 'col_lhs' and list 'col_rhs'"
        raise ValueError(msg)

    joiner = _join_how_api[how]
    # Out-parameter: libgdf writes the opaque result handle here.
    join_result_ptr = ffi.new("gdf_join_result_type**", None)
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    # The sort-based join requires the flag_sorted bit (second argument)
    # to be set; the hash join does not.
    if method == 'hash':
        libgdf.gdf_context_view(gdf_context, 0, method_api, 0)
    elif method == 'sort':
        libgdf.gdf_context_view(gdf_context, 1, method_api, 0)
    else:
        msg = "method not supported"
        raise ValueError(msg)

    if(how in ['left', 'inner']):
        # Multi-column join entry point: pass lists of column views.
        list_lhs = []
        list_rhs = []
        for i in range(len(col_lhs)):
            list_lhs.append(col_lhs[i].cffi_view)
            list_rhs.append(col_rhs[i].cffi_view)

        # Call libgdf
        joiner(len(col_lhs), list_lhs, list_rhs, join_result_ptr,
               gdf_context)
    else:
        # Other join kinds take a single column pair and no context.
        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, join_result_ptr)

    # Extract result: wrap the device buffer without copying, then split
    # it into the left-index and right-index halves.
    join_result = join_result_ptr[0]
    dataptr = libgdf.gdf_join_result_data(join_result)
    datasize = libgdf.gdf_join_result_size(join_result)
    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
                             nelem=datasize, dtype=np.int32)
    ary = ary.reshape(2, datasize // 2)
    # Empty result: yield the (empty) array twice instead of slicing it.
    yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
    # Free the libgdf-owned buffer once the caller is done; the yielded
    # arrays alias this memory and must not be used afterwards.
    libgdf.gdf_join_result_free(join_result)
def test_multileftjoin(dtype):
    """Left join on 1, 2 and 3 key columns, checked against pandas answers."""
    left = np.array([[0, 0, 4, 5, 5],
                     [1, 2, 2, 3, 4],
                     [1, 1, 3, 1, 2]], dtype=dtype)
    right = np.array([[0, 0, 2, 3, 5],
                      [1, 2, 3, 3, 4],
                      [3, 3, 2, 1, 1]], dtype=dtype)

    gdf_ctxt = new_context()
    # flag_sorted=0, hash-based join method.
    libgdf.gdf_context_view(gdf_ctxt, 0, libgdf.GDF_HASH, 0)

    for num_keys in (1, 2, 3):
        with _make_input_multi(left, right, num_keys) as (col_left,
                                                          col_right):
            joined_idx = _call_join_multi(libgdf.gdf_left_join, num_keys,
                                          col_left, col_right, gdf_ctxt)

            # Expected answers, generated with pandas:
            #   df.merge(df2, how='left', on=['a'])      -> 7 rows
            #      a  b_remove  c_remove    b    c
            #   0  0         1         1  1.0  3.0
            #   1  0         1         1  2.0  3.0
            #   2  0         2         1  1.0  3.0
            #   3  0         2         1  2.0  3.0
            #   4  4         2         3  NaN  NaN
            #   5  5         3         1  4.0  1.0
            #   6  5         4         2  4.0  1.0
            #   df.merge(df2, how='left', on=['a','b']) -> 5 rows
            #   df.merge(df2, how='left')               -> 5 rows
            left_pos, right_pos = joined_idx

            # The hash join may emit rows in any order; sort the index
            # pairs before comparing.
            pairs = sorted(zip(left_pos, right_pos))
            left_pos = [p[0] for p in pairs]
            right_pos = [p[1] for p in pairs]

            if num_keys == 1:
                assert tuple(left_pos) == (0, 0, 1, 1, 2, 3, 4)
                assert tuple(right_pos) == (0, 1, 0, 1, -1, 4, 4)
                gathered = [left[0][i] for i in left_pos]
                assert tuple(gathered) == (0, 0, 0, 0, 4, 5, 5)
            else:
                # With 2+ key columns each left row matches at most once,
                # so the left positions are simply 0..4 and gathering any
                # key column reproduces that column verbatim.
                assert tuple(left_pos) == (0, 1, 2, 3, 4)
                expected_rows = ((0, 0, 4, 5, 5),
                                 (1, 2, 2, 3, 4),
                                 (1, 1, 3, 1, 2))
                for col in range(num_keys):
                    gathered = [left[col][i] for i in left_pos]
                    assert tuple(gathered) == expected_rows[col]
def libgdf_join(col_lhs, col_rhs, on, how, method='sort'):
    """Join two column mappings with the multi-column libgdf join API.

    Parameters
    ----------
    col_lhs, col_rhs : mapping of column name -> series-like
        Each value must expose ``_column.cffi_view`` and ``_column.dtype``.
    on : sequence of str
        Key column names present in both inputs.
    how : {'left', 'inner', 'outer'}
    method : {'sort', 'hash'}

    Returns
    -------
    (res, valids) : pair of lists of device arrays
        Data and validity-mask arrays for each result column, ordered as:
        lhs non-key columns, key columns, rhs non-key columns.

    Raises
    ------
    ValueError
        If `how` is not one of the supported join kinds.
    """
    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')
    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    if how not in ['left', 'inner', 'outer']:
        # BUGFIX: the message previously claimed only left/inner were
        # supported although the guard above also accepts 'outer'.
        msg = "new join api only supports left, inner or outer"
        raise ValueError(msg)

    list_lhs = []
    list_rhs = []
    result_cols = []
    result_col_names = []  # NOTE(review): built but never used — keep or return? confirm
    left_idx = []
    right_idx = []

    # lhs non-key columns come first in the result.
    for name, col in col_lhs.items():
        list_lhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    # Then the key columns; record each key's position in both inputs so
    # libgdf knows which columns to join on.
    for name in on:
        result_cols.append(columnview(0, None,
                                      dtype=col_lhs[name]._column.dtype))
        result_col_names.append(name)
        left_idx.append(list(col_lhs.keys()).index(name))
        right_idx.append(list(col_rhs.keys()).index(name))

    # Finally the rhs non-key columns.
    for name, col in col_rhs.items():
        list_rhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    num_cols_to_join = len(on)
    # Key columns appear once in the result, not twice.
    result_num_cols = len(list_lhs) + len(list_rhs) - num_cols_to_join

    joiner(list_lhs, len(list_lhs), left_idx, list_rhs, len(list_rhs),
           right_idx, num_cols_to_join, result_num_cols, result_cols,
           ffi.NULL, ffi.NULL, gdf_context)

    res = []
    valids = []
    for col in result_cols:
        # Wrap the libgdf-allocated device buffers without copying; the
        # rmm finalizer frees them when the arrays are garbage collected.
        intaddr = int(ffi.cast("uintptr_t", col.data))
        res.append(rmm.device_array_from_ptr(
            ptr=intaddr,
            nelem=col.size,
            dtype=gdf_to_np_dtype(col.dtype),
            finalizer=rmm._make_finalizer(intaddr, 0)))
        intaddr = int(ffi.cast("uintptr_t", col.valid))
        valids.append(rmm.device_array_from_ptr(
            ptr=intaddr,
            nelem=calc_chunk_size(col.size, mask_bitsize),
            dtype=mask_dtype,
            finalizer=rmm._make_finalizer(intaddr, 0)))

    return res, valids