def test_validity_add(nelem):
    """Check that adding two masked Series nulls out every unmatched slot.

    Two random float arrays are wrapped with random validity bitmasks and
    added.  A result slot is expected to be valid only where the bitwise AND
    of both masks has its bit set; every other slot is compared after being
    filled with a sentinel value.

    Parameters
    ----------
    nelem : int
        Number of elements in each operand (presumably supplied by a pytest
        parametrization outside this block).
    """
    # Fixed seed so a failing input can be regenerated.
    np.random.seed(0)

    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = utils.random_bitmask(nelem)
    # The expanded bit list can be longer than nelem (the result mask below
    # is sliced for the same reason), so trim it before counting zeros;
    # otherwise trailing bits would be counted as nulls.
    lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
    lhs_null_count = utils.count_zero(lhs_bitmask)
    lhs = Series.from_masked_array(lhs_data, lhs_mask, lhs_null_count)

    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = utils.random_bitmask(nelem)
    rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
    rhs_null_count = utils.count_zero(rhs_bitmask)
    rhs = Series.from_masked_array(rhs_data, rhs_mask, rhs_null_count)

    # Result
    res = lhs + rhs
    # np.bool_ rather than the np.bool alias, which NumPy 1.24 removed.
    res_mask = np.asarray(
        utils.expand_bits_to_bytes(lhs_mask & rhs_mask),
        dtype=np.bool_,
    )[:nelem]

    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    expect[~res_mask] = na_value

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_validity_add(nelem):
    """Check null counts and null propagation when adding two masked Series.

    Each operand is built from random data plus a random validity bitmask.
    The Series' own ``null_count`` must equal the number of zero bits among
    the first ``nelem`` mask bits.  After the addition, a slot is expected to
    be valid only where the bitwise AND of both masks has its bit set; every
    other slot is compared after being filled with a sentinel value.

    Parameters
    ----------
    nelem : int
        Number of elements in each operand (presumably supplied by a pytest
        parametrization outside this block).
    """
    np.random.seed(0)

    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = utils.random_bitmask(nelem)
    # Only the first nelem expanded bits describe real elements.
    lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
    lhs_null_count = utils.count_zero(lhs_bitmask)
    assert lhs_null_count >= 0
    lhs = Series.from_masked_array(lhs_data, lhs_mask)
    assert lhs.null_count == lhs_null_count

    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = utils.random_bitmask(nelem)
    rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
    rhs_null_count = utils.count_zero(rhs_bitmask)
    assert rhs_null_count >= 0
    rhs = Series.from_masked_array(rhs_data, rhs_mask)
    assert rhs.null_count == rhs_null_count

    # Result
    res = lhs + rhs
    # np.bool_ rather than the np.bool alias, which NumPy 1.24 removed.
    res_mask = np.asarray(
        utils.expand_bits_to_bytes(lhs_mask & rhs_mask),
        dtype=np.bool_,
    )[:nelem]

    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    expect[~res_mask] = na_value

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_validity_ceil(nelem):
    """Check that ``Series.ceil`` matches ``np.ceil`` on valid slots only.

    Random values in ``[0, 100)`` are wrapped with a random validity bitmask.
    After ``ceil()``, null slots are filled with a sentinel and the result is
    compared with ``np.ceil`` of the raw data, with the same sentinel written
    wherever the mask bit is zero.

    Parameters
    ----------
    nelem : int
        Number of elements in the Series (presumably supplied by a pytest
        parametrization outside this block).
    """
    # Data
    data = np.random.random(nelem) * 100
    mask = utils.random_bitmask(nelem)
    # Trim the expanded bit list to the element count before counting zeros,
    # so bits beyond the last element are not reported as nulls.
    bitmask = utils.expand_bits_to_bytes(mask)[:nelem]
    null_count = utils.count_zero(bitmask)
    sr = Series.from_masked_array(data, mask, null_count)

    # Result
    res = sr.ceil()
    na_value = -100000
    got = res.fillna(na_value).to_array()
    res_mask = np.asarray(bitmask, dtype=np.bool_)[:data.size]
    expect = np.ceil(data)
    expect[~res_mask] = na_value

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_dataframe_to_string():
    """Exercise DataFrame text rendering under a 5-row / 8-column display limit.

    Covers three cases: row truncation via ``str(df)``, column truncation via
    ``to_string(ncols=3)``, and a column built from a masked Series whose
    null slots must be absent from the rendered output.
    """
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([
            ('a', [1, 2, 3, 4, 5, 6]),
            ('b', [11, 12, 13, 14, 15, 16]),
        ])
        rendered = str(df)
        print(rendered)
        assert rendered.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([
            ('a', [1, 2, 3, 4, 5, 6]),
            ('b', [11, 12, 13, 14, 15, 16]),
            ('c', [11, 12, 13, 14, 15, 16]),
            ('d', [11, 12, 13, 14, 15, 16]),
        ])
        rendered = df.to_string(ncols=3)
        print(rendered)
        tail = rendered.splitlines()
        assert tail[-2] == '[1 more rows]'
        assert tail[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([
            ('a', [1, 2, 3, 4, 5, 6]),
            ('b', [11, 12, 13, 14, 15, 16]),
        ])
        data = np.arange(6)
        # Single mask byte; bits 0, 2, 3 and 5 are set.
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101
        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        densearray = masked.to_array()
        np.testing.assert_equal(data[validids], densearray)
        # valid position is correct
        for idx in validids:
            assert data[idx] == values[idx]
        # null position is correct
        for idx, item in enumerate(values):
            if idx not in validids:
                assert item is None

        got = df.to_string(nrows=None)
        print(got)
        # Expected table flattened to whitespace-separated tokens; the
        # comparison below ignores layout.
        expect = ''' a b c 0 1 11 0 1 2 12 2 3 13 2 3 4 14 3 4 5 15 5 6 16 5 '''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
def test_to_dense_array():
    """Check that ``to_array()`` drops nulls while ``fillna='pandas'`` keeps length.

    An 8-element Series is built with the mask byte ``0b11010110``, which has
    three zero bits, hence ``null_count=3``.  The plain dense array must be
    shorter than the filled one, and the filled one must have one entry per
    Series element.
    """
    data = np.random.random(8)
    # 0b11010110 is 214, which does not fit a signed 8-bit integer; newer
    # NumPy releases warn about or reject that conversion.  Build the byte as
    # unsigned and reinterpret it, which keeps the same bit pattern and the
    # same np.byte dtype the Series receives.
    mask = np.asarray([0b11010110], dtype=np.uint8).view(np.byte)
    sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
    assert sr.null_count > 0
    assert sr.null_count != len(sr)
    filled = sr.to_array(fillna='pandas')
    dense = sr.to_array()
    assert dense.size < filled.size
    assert filled.size == len(sr)
def test_fillna():
    """Check that ``fillna`` replaces nulls and drops the null mask.

    Column 8 of the Arrow test data is wrapped as a masked Series and filled
    with 123.  The filled Series must compare equal to 123 everywhere, keep
    its length, and report no null mask.
    """
    schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    # The column must actually contain nulls for the test to mean anything.
    assert column.null_count

    sr = Series.from_masked_array(
        data=column.data,
        mask=column.null,
        null_count=column.null_count,
    )
    filled = sr.fillna(123)

    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(sr)
    assert not filled.has_null_mask
def test_series_unique():
    """Check ``Series.unique_k`` against a set of the valid values, plus overflow.

    For sizes 1 through 10**4, random integers in ``[-1, 10)`` are generated
    and every ``-1`` is masked out as null.  ``unique_k(k=10)`` must return
    exactly the distinct unmasked values.  Requesting ``k=7`` on ten distinct
    values must raise ``ValueError``.
    """
    for exponent in range(5):
        size = 10 ** exponent
        arr = np.random.randint(low=-1, high=10, size=size)
        valid = arr != -1
        sr = Series.from_masked_array(arr, Series(valid).as_mask())
        uniques = sr.unique_k(k=10).to_array()
        assert set(arr[valid]) == set(uniques)

    # test out of space
    distinct = Series(np.arange(10))
    with pytest.raises(ValueError) as excinfo:
        distinct.unique_k(k=7)
    excinfo.match('too many unique value')
def test_series_unique():
    """Check ``unique``, ``unique_count`` and ``value_counts`` on masked data.

    For sizes 1 through 10**4, random integers in ``[-1, 10)`` are generated
    and every ``-1`` is masked out as null.  The Series results are compared
    with a Python set of the unmasked values and with pandas
    ``value_counts`` on the same values.
    """
    for exponent in range(5):
        size = 10 ** exponent
        arr = np.random.randint(low=-1, high=10, size=size)
        valid = arr != -1
        sr = Series.from_masked_array(arr, Series(valid).as_mask())

        assert set(arr[valid]) == set(sr.unique().to_array())
        assert len(set(arr[valid])) == sr.unique_count()

        # Reference counts come from pandas on the unmasked values only.
        reference = pd.DataFrame(data=arr[valid], columns=['col'])
        expect = reference.col.value_counts().sort_index()
        got = sr.value_counts().to_pandas().sort_index()
        print(expect.head())
        print(got.head())
        assert got.equals(expect)
def test_series_reductions(method, dtype):
    """Compare a masked Series reduction with the same NumPy reduction.

    The named reduction is called with no arguments on the NumPy array of
    unmasked values and on the masked Series; the two results must be
    approximately equal.

    Parameters
    ----------
    method : str
        Name of the reduction method looked up on both objects.
    dtype : numpy dtype or dtype-like
        Element type the random data is cast to.
    """
    np.random.seed(0)
    values = np.random.random(100)
    # The validity mask is chosen before the cast; integer dtypes are scaled
    # up first.
    if np.issubdtype(dtype, np.integer):
        values *= 100
        valid = values > 10
    else:
        valid = values > 0.5
    values = values.astype(dtype)
    unmasked = values[valid]
    sr = Series.from_masked_array(values, Series(valid).as_mask())

    expect = getattr(unmasked, method)()
    got = getattr(sr, method)()
    print(expect, got)
    np.testing.assert_approx_equal(expect, got)
def test_series_unique():
    """Check that ``Series.unique`` returns exactly the distinct valid values.

    For sizes 1 through 10**4, random integers in ``[-1, 10)`` are generated
    and every ``-1`` is masked out as null.  The unique values from the
    Series must match a Python set of the unmasked entries.
    """
    for exponent in range(5):
        size = 10 ** exponent
        arr = np.random.randint(low=-1, high=10, size=size)
        valid = arr != -1
        sr = Series.from_masked_array(arr, Series(valid).as_mask())
        uniques = sr.unique().to_array()
        assert set(arr[valid]) == set(uniques)