示例#1
0
    def test_fill_forward(self):
        """
        Verify that Categorical.fill_forward propagates values forward *within each group*.

        The two trailing NaNs sit in groups 'A' and 'B' respectively, and each is
        expected to be replaced by the last preceding value of its own group.
        """
        values = rt.FA([1.0, 4.0, 9.0, 16.0, np.nan, np.nan])
        groups = rt.Categorical(['A', 'B', 'A', 'B', 'A', 'B'])

        filled = groups.fill_forward(values)

        # A Dataset is the expected container for the output.
        assert isinstance(filled, rt.Dataset)

        # Row count must equal the length of the input array: the operation is
        # non-reducing (apply_nonreduce / "scan" / "prefix sum" style).
        row_count = filled.shape[0]
        assert row_count == len(values)

        # Column count is (N + M): N key columns coming from the Categorical
        # plus M data columns that were operated on (a single one here).
        key_col_count = len(groups.category_dict)
        col_count = filled.shape[1]
        assert col_count == key_col_count + 1

        # Inspect the filled data itself; it is compared against a FastArray
        # built the same way as the input column.
        expected_values = rt.FA([1.0, 4.0, 9.0, 16.0, 9.0, 16.0])
        assert_array_equal(filled[0], expected_values)
示例#2
0
    def test_sample(self):
        """
        Exercise ``sample`` on a Dataset and on a FastArray.

        numpy's global RNG is seeded with 1 before every draw except the
        "overflow" case, presumably so that the drawn rows are reproducible.
        """
        # --- Dataset.sample: 3 rows, restricted by a boolean filter ---
        table = rt.Dataset({'num': [1, 2, 3, 4, 5], 'str': ['ab', 'bc', 'cd', 'de', 'ef']})
        wanted_table = rt.Dataset({'num': [1, 3, 5], 'str': ['ab', 'cd', 'ef']})
        np.random.seed(1)
        drawn_table = table.sample(3, rt.FA([True, True, True, False, True]))
        assert (wanted_table == drawn_table).all(axis=None)

        # --- FastArray.sample: 2 elements, restricted by a boolean filter ---
        numbers = rt.FA([1, 2, 3, 4, 5])
        np.random.seed(1)
        drawn = numbers.sample(2, rt.FA([False, True, True, False, True]))
        assert (rt.FA([2, 5]) == drawn).all(axis=None)

        # --- Overflow: ask for 10 elements while the filter only admits two ---
        # The expectation is simply every element that passes the filter.
        drawn = numbers.sample(10, rt.FA([False, True, False, False, True]))
        assert (rt.FA([2, 5]) == drawn).all(axis=None)

        # --- No filter at all ---
        np.random.seed(1)
        drawn = numbers.sample(2)
        assert (rt.FA([2, 3]) == drawn).all(axis=None)

        # --- Filter given as a fancy (integer) index instead of a boolean mask ---
        np.random.seed(1)
        drawn = numbers.sample(2, rt.FA([1, 3, 4]))
        assert (rt.FA([2, 5]) == drawn).all(axis=None)
示例#3
0
    def test_sample(self):
        """
        Exercise ``sample`` on a Dataset and on a FastArray, passing ``seed=1``
        explicitly to every call.
        """
        # --- Dataset.sample: 3 rows, restricted by a boolean filter ---
        table = rt.Dataset({'num': [1, 2, 3, 4, 5], 'str': ['ab', 'bc', 'cd', 'de', 'ef']})
        drawn_table = table.sample(3, rt.FA([True, True, True, False, True]), seed=1)
        wanted_table = rt.Dataset({'num': [1, 2, 5], 'str': ['ab', 'bc', 'ef']})
        # Same column names first, then a column-by-column comparison so that a
        # failure names the offending column.
        assert drawn_table.keys() == wanted_table.keys()
        for col_name in wanted_table.keys():
            wanted_col = wanted_table[col_name]
            drawn_col = drawn_table[col_name]
            assert_array_equal(wanted_col, drawn_col, err_msg=f"Column '{col_name}' differs.")

        # --- FastArray.sample: 2 elements, restricted by a boolean filter ---
        numbers = rt.FA([1, 2, 3, 4, 5])
        drawn = numbers.sample(2, rt.FA([False, True, True, False, True]), seed=1)
        assert_array_equal(rt.FA([2, 3]), drawn)

        # --- Overflow: ask for 10 elements while the filter only admits two ---
        drawn = numbers.sample(10, rt.FA([False, True, False, False, True]), seed=1)
        assert_array_equal(rt.FA([2, 5]), drawn)

        # --- No filter at all ---
        drawn = numbers.sample(2, seed=1)
        assert_array_equal(rt.FA([2, 3]), drawn)

        # --- Filter given as a fancy (integer) index instead of a boolean mask ---
        drawn = numbers.sample(2, rt.FA([1, 3, 4]), seed=1)
        assert_array_equal(rt.FA([2, 4]), drawn)
示例#4
0
    def test_index_any_of(self):
        """Check FAString.index_any_of, and its older ``strpbrk`` alias, for the character set 'PZG'."""
        char_set = 'PZG'
        # Expected result for one pass over SYMBOLS.
        per_symbol = [2, 2, -1, 0, -1]

        # Plain input.
        found = FAString(SYMBOLS).index_any_of(char_set)
        assert_array_equal(found, rt.FA(per_symbol))

        # Larger input; the expectation is the base pattern repeated
        # PARALLEL_MULTIPLIER times.
        found = FAString(NB_PARALLEL_SYMBOLS).index_any_of(char_set)
        assert_array_equal(found, rt.FA(per_symbol * PARALLEL_MULTIPLIER))

        # The old alias must give the same answer as index_any_of.
        found = FAString(SYMBOLS).strpbrk(char_set)
        assert_array_equal(found, rt.FA(per_symbol))
示例#5
0
    def test_basicmath_two_inputs(self):
        """Call rc.BasicMathTwoInputs with CMP_EQ on a tiled array and a 0-d int64 value of 1."""
        lhs = rt.FA([1, 2, 3, 4, 5]).tile(40)
        rhs = np.array(1, dtype=np.int64)
        operation = rt.rt_enum.MATH_OPERATION.CMP_EQ
        # Third argument of the call. NOTE(review): its meaning is not visible
        # from this test -- confirm against the BasicMathTwoInputs signature.
        final_num = 7

        # Run the operation on the (lhs, rhs) pair.
        outcome = rc.BasicMathTwoInputs((lhs, rhs), operation, final_num)

        # Only the first element of each 5-element repetition equals 1.
        expected = rt.FA([True, False, False, False, False]).tile(40)
        assert_array_equal(outcome, expected)
示例#6
0
    def test_take_groups(self):
        """Check that Grouping.take_groups extracts the data elements of the requested groups."""
        ## Case 1: Basic operation.
        # Key data in which the value k occurs k times (k = 1..6), and a Grouping built from it.
        keys = rt.FA([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6])
        grouping = rt.Grouping(keys)

        # A data array aligned with the keys: the row positions 0..len(keys)-1.
        row_ids = rt.arange(len(keys))

        # Ask for the even-numbered groups only.
        wanted_groups = rt.FA([2, 4, 6])
        taken = Grouping.take_groups(row_ids, wanted_groups, grouping.ncountgroup, grouping.ifirstgroup)

        # Rows 1-2 hold key 2, rows 6-9 hold key 4 and rows 15-20 hold key 6.
        expected = rt.FA([1, 2, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20])
        assert_array_equal(expected, taken)
class TestPyarrowConvertFastArray:
    """Tests for converting rt.FastArray instances to and from pyarrow arrays."""

    @pytest.mark.parametrize(('rt_farr',), [
        pytest.param(rt.FA([], dtype=np.int8), id='empty(int8)'),
        pytest.param(rt.FA([-120, rt.int8.inv, -1, 0, 1, 101, 127], dtype=np.int8), id='int8'),
        pytest.param(rt.FA([0.01, -0.0, np.nan, 1e10, -1e10, np.inf, np.pi], dtype=np.float32), id='float32'),
        # TODO: a bool case is not covered yet.
        pytest.param(rt.FA([b'ABC', b'abcde'], dtype='S'), id='ascii'),
        # Known failure, kept disabled so the gap stays visible:
        #FAILS# pytest.param(rt.FA(['ABC', 'abcde'], unicode=True), id='unicode'),
        # NOTE(review): '\u' consumes exactly four hex digits, so '\u1F600C' is
        # U+1F60 followed by the characters '0C'. If the emoji U+1F600 was
        # intended it has to be spelled '\U0001F600' -- confirm the intent.
        pytest.param(rt.FA(['A\u1F600C', 'abcde'], unicode=True), id='unicode 2'),
    ])
    def test_roundtrip_rt_pa_rt(self, rt_farr: rt.FastArray) -> None:
        """Test round-tripping from rt.FastArray to pyarrow.Array and back."""
        result_pa_arr = rt_farr.to_arrow()
        result_farr = rt.FastArray.from_arrow(result_pa_arr, zero_copy_only=False)
        assert_array_equal(rt_farr, result_farr)

    @pytest.mark.parametrize(('rt_farr',), [
        pytest.param(rt.FA([b'ABC', b'abcde'], dtype='S'), id='ascii'),
        pytest.param(rt.FA(['ABC', 'abcde'], unicode=True), id='unicode'),
        # NOTE(review): see the escape-sequence remark on the same literal in
        # the round-trip parameters of this class ('\u1F60' + '0C').
        pytest.param(rt.FA(['A\u1F600C', 'abcde'], unicode=True), id='unicode 2'),
    ])
    def test_rt_pa_str(self, rt_farr: rt.FastArray) -> None:
        """
        Test converting a string rt.FastArray to a pyarrow.Array and on to a Python list.

        Only lengths are checked: the list must have as many entries as the
        array, and every entry must be as long as the corresponding array element.
        """
        result_pa_arr = rt_farr.to_arrow()
        result_list = result_pa_arr.to_pylist()

        assert len(rt_farr) == len(result_list), "Element count changed during conversion."
        for i, str_result in enumerate(result_list):
            str_farr = rt_farr[i]
            assert len(str_farr) == len(str_result), f"String length differs at index {i}."
示例#8
0
    def test_mbget_bytes_int64(self) -> None:
        """Check rc.MBGet on a string array with int64 indices that include the int64 invalid value."""
        values = rt.FA(['x', 'y', 'z', 'q', 'w', 't'])
        bad_idx = rt.int64.inv
        fancy_index = rt.FA(
            [bad_idx, bad_idx, bad_idx, 0, 3, bad_idx, 1, 4, bad_idx, 2, 5, bad_idx],
            dtype=np.int64,
        )

        # Run the operation under test.
        fetched = rc.MBGet(values, fancy_index)

        # Positions indexed with the invalid value are expected to hold the
        # invalid of the values array; the rest hold the element that was indexed.
        fill = values.inv
        expected = rt.FA(
            [fill, fill, fill, b'x', b'q', fill, b'y', b'w', fill, b'z', b't', fill]
        )
        assert_array_equal(fetched, expected)
示例#9
0
    def test_nan_awareness(self, arr, func_type):
        """
        Check how :func:`rt.nansum` handles NaN values by comparing it against :func:`rt.sum`.

        The NaN-aware callable (selected by ``func_type``), the NaN-unaware
        :func:`rt.sum`, and the input wrapped as a FastArray are handed to
        ``NanAwareTestImpl.test_nan_awareness``, which performs the comparison.

        Raises ``ValueError`` when ``func_type`` is neither 'module' nor 'member'.
        """
        # Reject unknown calling conventions up front.
        if func_type not in ('module', 'member'):
            raise ValueError(
                f"Unhandled value '{func_type}' specified for the function type."
            )

        # Pick the NaN-aware implementation: module-level function or array method.
        if func_type == 'module':
            nan_aware = lambda x: rt.nansum(x)
        else:
            nan_aware = lambda x: x.nansum()

        # The NaN-unaware counterpart used as the reference.
        nan_unaware = lambda x: rt.sum(x)

        # Wrapping as a FastArray makes sure the riptable implementation is the one invoked.
        fast_arr = rt.FA(arr)

        # Delegate to the shared test implementation.
        NanAwareTestImpl.test_nan_awareness(nan_aware, nan_unaware, fast_arr)
class TestPyarrowConvertDataset:
    """Tests for converting rt.Dataset instances to and from pyarrow tables."""

    @pytest.mark.parametrize(('rt_dset',), [
        pytest.param(rt.Dataset({}), id='empty'),
        pytest.param(
            rt.Dataset({
                'ink_capacity': rt.FA([15, 10, 15, 25, 10, 15, 25, 15]),
                'purchase_date': rt.Date([
                    '2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22',
                    '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17',
                ]),
                'country_code': rt.Categorical(
                    # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
                    [36, 36, 344, 840, 840, 124, 36, 484],
                    {
                        'IRL': 372, 'USA': 840, 'AUS': 36, 'HKG': 344, 'JPN': 392,
                        'MEX': 484, 'KHM': 116, 'THA': 764, 'JAM': 388, 'ARM': 51,
                    },
                    ordered=True,
                ),
            })
        ),
    ])
    def test_roundtrip_rt_pa_rt(self, rt_dset: rt.Dataset) -> None:
        """Convert an rt.Dataset to a pyarrow.Table and back, then compare it column by column."""
        arrow_table = rt_dset.to_arrow()
        restored = rt.Dataset.from_arrow(arrow_table, zero_copy_only=False)

        # Column names must survive the trip unchanged.
        assert rt_dset.keys() == restored.keys()

        for col_name in rt_dset.keys():
            original_col = rt_dset[col_name]
            restored_col = restored[col_name]
            # relaxed_cat_check=True: the details of Categorical conversion are
            # not the subject here; the dataset-level behaviour is.
            assert_array_or_cat_equal(original_col, restored_col, relaxed_cat_check=True)
示例#11
0
    def test_mbget_int32_int64(self) -> None:
        """Check rc.MBGet on an int32 array with int64 indices that include the minimum int64 value."""
        values = rt.FA([1, 2, 3, 4, 5, 6], dtype=np.int32)
        # Smallest representable int64 (-9223372036854775808), used as the
        # index entry for positions that should not resolve to an element.
        bad_idx = np.iinfo(np.int64).min
        fancy_index = rt.FA(
            [bad_idx, bad_idx, bad_idx, 0, 3, bad_idx, 1, 4, bad_idx, 2, 5, bad_idx],
            dtype=np.int64,
        )

        # Run the operation under test.
        fetched = rc.MBGet(values, fancy_index)

        # Those positions are expected to hold the invalid of the values array;
        # the rest hold the element that was indexed.
        fill = values.inv
        expected = rt.FA([fill, fill, fill, 1, 4, fill, 2, 5, fill, 3, 6, fill])
        assert_array_equal(fetched, expected)
示例#12
0
def test_mbget_no_default_uses_invalid():
    """Check rt.mbget without a default: out-of-bounds positions must come back as the NA/sentinel value."""
    # 50 int8 values (3..52), viewed as a FastArray.
    values = np.arange(start=3, stop=53, dtype=np.int8).view(rt.FA)
    fancy_index = rt.FA([0, 25, -40, 17, 100, -80, 50, -51, 35])

    # Mask of the index entries that fall inside [-50, 50).
    in_bounds = np.logical_and(fancy_index >= -50, fancy_index < 50)

    # No explicit default value is passed here.
    fetched = rt.mbget(values, fancy_index)

    # The dtype of the values array must carry over to the result.
    assert values.dtype == fetched.dtype, "The result has a different dtype than the values/data array."

    # With no default given, every out-of-bounds position should hold the
    # riptable NA/sentinel value, so "not NaN" must coincide with "in bounds".
    assert_array_equal(in_bounds, rt.isnotnan(fetched))

    # The in-bounds positions must have picked up the right elements.
    expected_hits = rt.FA([3, 28, 13, 20, 38])
    assert_array_equal(expected_hits, fetched[in_bounds])
示例#13
0
    def test_extract_groups_all_groups_off(self):
        """
        Test for Grouping.extract_groups() when given a condition mask with all values set to False.

        With every group masked out, the extraction is expected to yield an empty array.
        """

        # Create a grouping from some data.
        key_data1 = rt.FA([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6])
        g1 = rt.Grouping(key_data1)

        # Create another data array the same length as the key for the Grouping.
        data1 = rt.arange(len(key_data1))

        # Create a condition mask with all values set to False.
        # The builtin ``bool`` is used rather than the ``np.bool`` alias, which
        # was deprecated in NumPy 1.20 and removed in 1.24; the alias was just
        # the builtin, so the resulting dtype is the same.
        group_mask1 = rt.zeros(len(g1.ncountgroup), dtype=bool)

        # Extract elements from the data array with all groups masked out -- i.e. we're trying
        # to select data from none of the groups.
        result1 = Grouping.extract_groups(group_mask1, data1, g1.ncountgroup, g1.ifirstgroup)

        assert_array_equal(rt.FA([]), result1)
示例#14
0
def test_mbget_with_explicit_default():
    """Check rt.mbget with a default that fits the data dtype: out-of-bounds positions get that default."""
    # 50 int8 values (3..52), viewed as a FastArray.
    values = np.arange(start=3, stop=53, dtype=np.int8).view(rt.FA)
    fancy_index = rt.FA([0, 25, -40, 17, 100, -80, 50, -51, 35])

    # Mask of the index entries that fall inside [-50, 50).
    in_bounds = np.logical_and(fancy_index >= -50, fancy_index < 50)

    # Pass an explicit default through the ``d`` keyword.
    fill_value = 123
    fetched = rt.mbget(values, fancy_index, d=fill_value)

    # The dtype of the values array must carry over to the result.
    assert values.dtype == fetched.dtype, "The result has a different dtype than the values/data array."

    # Every out-of-bounds position should hold the explicit default, so
    # "differs from the default" must coincide with "in bounds".
    assert_array_equal(in_bounds, fetched != fill_value)

    # The in-bounds positions must have picked up the right elements.
    expected_hits = rt.FA([3, 28, 13, 20, 38])
    assert_array_equal(expected_hits, fetched[in_bounds])
示例#15
0
def test_mbget_with_too_large_explicit_default():
    """Check rt.mbget with a default that does not fit the data dtype: the result dtype must widen."""
    # 50 int8 values (3..52), viewed as a FastArray.
    values = np.arange(start=3, stop=53, dtype=np.int8).view(rt.FA)
    fancy_index = rt.FA([0, 25, -40, 17, 100, -80, 50, -51, 35])

    # Mask of the index entries that fall inside [-50, 50).
    in_bounds = np.logical_and(fancy_index >= -50, fancy_index < 50)

    # 1234 cannot be represented by the int8 dtype of the values array.
    fill_value = 1234
    fetched = rt.mbget(values, fancy_index, d=fill_value)

    # To hold that default the result needs a dtype different from, and wider
    # than, the dtype of the original values array.
    assert values.dtype != fetched.dtype, "The result has the same dtype as the values/data array."
    source_width = np.dtype(values.dtype).itemsize
    result_width = np.dtype(fetched.dtype).itemsize
    assert source_width < result_width, "The result dtype is not larger than the values/data dtype."

    # Every out-of-bounds position should hold the explicit default, so
    # "differs from the default" must coincide with "in bounds".
    assert_array_equal(in_bounds, fetched != fill_value)

    # The in-bounds positions must have picked up the right elements.
    expected_hits = rt.FA([3, 28, 13, 20, 38])
    assert_array_equal(expected_hits, fetched[in_bounds])
示例#16
0
    def test_accum_cols_noncat(self):
        """Check rt.accum_cols when the key is a plain FastArray rather than a Categorical."""
        row_total = 10
        # Alternating 0/1 keys, and a column of ones to accumulate.
        keys = rt.FA([0, 1] * (row_total // 2))
        ones = rt.full(row_total, 1.0)

        actual = rt.accum_cols(keys, ones)

        # Five ones land on each key; the footer row carries the grand total.
        expected = rt.Dataset({'YLabel': [0, 1], 'col0': [5.0, 5.0]})
        expected.footer_set_values('Total', {'YLabel': 'Total', 'col0': 10.0})

        self.assertTrue((actual == expected).all(axis=None))
示例#17
0
    def test_save_load_array(self, arr, tmpdir):
        """Round-trip an array through save_sds/load_sds, first as supplied and then wrapped as a FastArray."""
        target = str(tmpdir.join(name(arr)))

        # Pass 1: the array exactly as supplied.
        save_sds(target, arr)
        reloaded = load_sds(target)
        assert_save_load(reloaded, arr)
        assert_array_equal_(reloaded, arr)

        # Pass 2: the same data wrapped with rt.FA, written to the same path.
        wrapped = rt.FA(arr)
        save_sds(target, wrapped)
        reloaded_wrapped = load_sds(target)
        assert_array_equal_(reloaded_wrapped, wrapped)
def test_boolean_indexing(input, bools, expected):
    """
    Compare boolean-mask indexing of a FastArray with the same indexing on an ndarray.

    Both the mask and the data are forced into Fortran-contiguous layout first.
    """
    # The mask has to be a contiguous boolean array.
    mask = np.array(bools)
    if not mask.flags.f_contiguous:
        mask = np.asfortranarray(bools)
    assert mask.flags.f_contiguous

    # Reference behaviour: plain ndarray indexing.
    nd_arr = np.asfortranarray(input)
    assert nd_arr.flags.f_contiguous
    assert_array_equal(nd_arr[mask], expected)

    # Behaviour under test: the same mask applied to a FastArray.
    fast_arr = rt.FA(nd_arr)
    assert fast_arr.flags.f_contiguous
    selected = fast_arr[mask]
    # use ndarray element indexing for assertion
    assert_array_equal(super(rt.FA, selected), expected)
示例#19
0
def test_merge_cats_stringcat_with_empty():
    """Check rt.merge_cats on string categories that contain an empty string: two uniques are expected."""
    # Arbitrary lengths of the two test Categoricals, named once so that the
    # index array and the cutoffs are built from the same numbers.
    first_len, second_len = 12, 7
    cat_lens = [first_len, second_len]

    indices = np.hstack([np.full(first_len, 1), np.full(second_len, 2)])
    idx_cutoffs = np.cumsum(cat_lens)

    listcats = [rt.FA([b'2019/12/21', b'', b'2019/12/21'], dtype='|S32')]
    uniques_cutoffs = [np.array([1, 3], dtype=np.int64)]
    # One cutoff array is needed per category array.
    assert len(listcats) == len(uniques_cutoffs)

    fixed_indices, stacked_uniques = rt.merge_cats(
        indices,
        listcats,
        idx_cutoffs=idx_cutoffs,
        unique_cutoffs=uniques_cutoffs,
    )

    merged_uniques = stacked_uniques[0]
    assert len(merged_uniques) == 2
示例#20
0
    def test_isnan_implies_nan_result(self, arr, func_type):
        """
        Check how :func:`rt.max` handles NaN values.

        One or more NaNs in the input array should result in the function returning a NaN.
        The callable selected by ``func_type`` and the input wrapped as a FastArray are
        handed to ``NanUnawareTestImpl.test_isnan_implies_nan_result``.

        Raises ``ValueError`` when ``func_type`` is neither 'module' nor 'member'.
        """
        # Reject unknown calling conventions up front.
        if func_type not in ('module', 'member'):
            raise ValueError(
                f"Unhandled value '{func_type}' specified for the function type."
            )

        # Pick the implementation: module-level function or array method.
        if func_type == 'module':
            max_impl = lambda x: rt.max(x)
        else:
            max_impl = lambda x: x.max()

        # Wrapping as a FastArray makes sure the riptable implementation is the one invoked.
        fast_arr = rt.FA(arr)

        # Delegate to the shared test implementation.
        NanUnawareTestImpl.test_isnan_implies_nan_result(max_impl, fast_arr)
示例#21
0
    def test_save_load_dataset_array(self, arr, tmpdir):
        """Round-trip a single-column Dataset through save/load, with the column as supplied and as a FastArray."""
        # Pass 1: the array as supplied, stored under the name derived from it.
        col = name(arr)
        path = str(tmpdir.join(col))

        original = rt.Dataset({col: arr})
        original.save(path)
        reloaded = rt.Dataset.load(path)

        assert_save_load(reloaded, original)
        assert_array_equal_(reloaded[col], original[col])

        # Pass 2: the same data wrapped with rt.FA, stored under the name derived from the wrapper.
        wrapped = rt.FA(arr)
        col = name(wrapped)
        path = str(tmpdir.join(col))

        original = rt.Dataset({col: wrapped})
        original.save(path)
        reloaded = rt.Dataset.load(path)

        assert_save_load(reloaded, original)
        assert_array_equal_(original[col], reloaded[col])
示例#22
0
 def test_strstr_cat(self):
     """Check ``.str.strstr('A')`` on the ``cat_symbol`` fixture attribute."""
     found = self.cat_symbol.str.strstr('A')
     # Invalid value matching the dtype of the result.
     sentinel = rt.INVALID_DICT[np.dtype(found.dtype).num]
     one_pass = rt.FA([sentinel, 0, 0, -1, -1, -1], dtype=found.dtype)
     assert_array_equal(found, one_pass.tile(3))
示例#23
0
def _get_categorical_multikey_data() -> List[List[rt.FastArray]]:
    """
    Build key lists usable for constructing MultiKey Categoricals.

    The lists cover keys that are all strings, all numerics, and combinations
    of the two, with two, three or four keys per entry.
    """
    strings = get_categorical_base_data()
    numerics = _get_categorical_numeric_data()

    # consider parameterizing over the number of keys instead of literal handling of up to four keys

    # Two keys of the same dtype and value.
    multikeys = [[rt.FA(vals), rt.FA(vals)] for vals in strings + numerics]

    # Two keys of different dtypes.
    multikeys.extend(
        [rt.FA(text), rt.FA(num)] for text, num in zip(strings, numerics)
    )

    # Three keys of different dtypes.
    multikeys.extend(
        [rt.FA(text0), rt.FA(text1), rt.FA(num)]
        for text0, text1, num in zip(strings, strings, numerics)
    )

    # Four keys of different dtypes.
    multikeys.extend(
        [rt.FA(text0), rt.FA(text1), rt.FA(text2), rt.FA(num)]
        for text0, text1, text2, num in zip(strings, strings, strings, numerics)
    )
    return multikeys
示例#24
0
 def test_strlen_parallel(self):
     """Check ``FAString.strlen`` on the larger NB_PARALLEL_SYMBOLS input."""
     lengths = rt.FAString(NB_PARALLEL_SYMBOLS).strlen
     # Expected lengths for one pass, repeated PARALLEL_MULTIPLIER times.
     one_pass = rt.FA([4, 4, 2, 4, 3], dtype=lengths.dtype)
     assert_array_equal(lengths, one_pass.tile(PARALLEL_MULTIPLIER))
示例#25
0
 def test_strlen_cat(self):
     """Check ``.str.strlen`` on the ``cat_symbol`` fixture attribute."""
     lengths = self.cat_symbol.str.strlen
     # Invalid value matching the dtype of the result.
     sentinel = rt.INVALID_DICT[np.dtype(lengths.dtype).num]
     one_pass = rt.FA([sentinel, 4, 4, 2, 4, 3], dtype=lengths.dtype)
     assert_array_equal(lengths, one_pass.tile(3))
示例#26
0
 def test_removetrailing_empty(self) -> None:
     """Check that ``.str.removetrailing()`` on an empty 'S11' array gives back an equal array."""
     empty = rt.FA([], dtype='S11')
     stripped = empty.str.removetrailing()
     assert_array_equal(empty, stripped)
示例#27
0
 def test_index_any_of_cat(self):
     """Check ``.str.index_any_of('PZG')`` on the ``cat_symbol`` fixture attribute."""
     found = self.cat_symbol.str.index_any_of('PZG')
     # Invalid value matching the dtype of the result.
     sentinel = rt.INVALID_DICT[np.dtype(found.dtype).num]
     one_pass = rt.FA([sentinel, 2, 2, -1, 0, -1], dtype=found.dtype)
     assert_array_equal(found, one_pass.tile(3))
    result = fa.duplicated(keep=keep)
    for r, e in zip(result, expected):
        assert r == e


@pytest.mark.parametrize(
    "decay_rate, filter, reset, dtype_override, expected",
    [
        # Decay rate == 0 means there is no decay; since there's no decay,
        # there's effectively no time component to the EMA so we just have a cumsum.
        (
            0,
            None,
            None,
            None,
            rt.FA([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]),
        ),
        # Simple use case with a 50% decay rate per time-unit (the actual time unit used doesn't matter)
        (
            0.5,
            None,
            None,
            None,
            rt.FA([
                1.0,
                1.606531,
                2.606531,
                1.958889,
                2.188126,
                2.327166,
                2.812398,