示例#1
0
    def test_sample(self):
        """Check ``sample`` on a Dataset and on a FastArray after seeding NumPy's RNG."""
        # Dataset.sample with a boolean filter
        table = rt.Dataset({'num': [1, 2, 3, 4, 5], 'str': ['ab', 'bc', 'cd', 'de', 'ef']})
        np.random.seed(1)
        row_mask = rt.FA([True, True, True, False, True])
        picked_rows = table.sample(3, row_mask)
        wanted_rows = rt.Dataset({'num': [1, 3, 5], 'str': ['ab', 'cd', 'ef']})
        assert (wanted_rows == picked_rows).all(axis=None)

        # FastArray.sample with a boolean filter
        values = rt.FA([1, 2, 3, 4, 5])
        np.random.seed(1)
        picked = values.sample(2, rt.FA([False, True, True, False, True]))
        assert (rt.FA([2, 5]) == picked).all(axis=None)

        # Overflow: ask for more elements than the filter admits.
        # The RNG is deliberately not reseeded before this call.
        picked = values.sample(10, rt.FA([False, True, False, False, True]))
        assert (rt.FA([2, 5]) == picked).all(axis=None)

        # No filter at all
        np.random.seed(1)
        picked = values.sample(2)
        assert (rt.FA([2, 3]) == picked).all(axis=None)

        # Fancy (integer) index used as the filter
        np.random.seed(1)
        picked = values.sample(2, rt.FA([1, 3, 4]))
        assert (rt.FA([2, 5]) == picked).all(axis=None)
示例#2
0
    def test_sample(self):
        """Check ``sample`` on a Dataset and on a FastArray using an explicit ``seed=1``."""
        # Dataset.sample with a boolean filter
        table = rt.Dataset({'num': [1, 2, 3, 4, 5], 'str': ['ab', 'bc', 'cd', 'de', 'ef']})
        sampled_table = table.sample(3, rt.FA([True, True, True, False, True]), seed=1)
        wanted_table = rt.Dataset({'num': [1, 2, 5], 'str': ['ab', 'bc', 'ef']})
        assert sampled_table.keys() == wanted_table.keys()
        for col_name in wanted_table.keys():
            assert_array_equal(wanted_table[col_name], sampled_table[col_name], err_msg=f"Column '{col_name}' differs.")

        # FastArray.sample scenarios: (count, positional filter arguments, expected values)
        values = rt.FA([1, 2, 3, 4, 5])
        scenarios = (
            # boolean filter
            (2, (rt.FA([False, True, True, False, True]),), [2, 3]),
            # overflow: more elements requested than the filter admits
            (10, (rt.FA([False, True, False, False, True]),), [2, 5]),
            # no filter
            (2, (), [2, 3]),
            # fancy (integer) index
            (2, (rt.FA([1, 3, 4]),), [2, 4]),
        )
        for count, filter_args, wanted in scenarios:
            assert_array_equal(rt.FA(wanted), values.sample(count, *filter_args, seed=1))
示例#3
0
def get_doctest_dataset_data():
    """Return a dict mapping fixture names to small example ``rt.Dataset`` objects."""
    simple_1 = rt.Dataset({'A': [0, 1, 6, 7], 'B': [1.2, 3.1, 9.6, 21]})
    simple_2 = rt.Dataset({'X': [0, 1, 6, 9], 'C': [2.4, 6.2, 19.2, 53]})
    complex_1 = rt.Dataset({
        'A': [0, 6, 9, 11],
        'B': ['Q', 'R', 'S', 'T'],
        'C': [2.4, 6.2, 19.2, 25.9],
    })
    complex_2 = rt.Dataset({
        'A': [0, 1, 6, 10],
        'B': ['Q', 'R', 'R', 'T'],
        'E': [1.5, 3.75, 11.2, 13.1],
    })
    return {
        'ds_simple_1': simple_1,
        'ds_simple_2': simple_2,
        'ds_complex_1': complex_1,
        'ds_complex_2': complex_2,
    }
class TestPyarrowConvertDataset:
    """Round-trip conversion tests between ``rt.Dataset`` and pyarrow."""

    @pytest.mark.parametrize(
        ('rt_dset',),
        [
            pytest.param(rt.Dataset({}), id='empty'),
            pytest.param(
                rt.Dataset({
                    'ink_capacity': rt.FA([15, 10, 15, 25, 10, 15, 25, 15]),
                    'purchase_date': rt.Date([
                        '2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22',
                        '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17',
                    ]),
                    'country_code': rt.Categorical(
                        # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
                        [36, 36, 344, 840, 840, 124, 36, 484],
                        {
                            'IRL': 372, 'USA': 840, 'AUS': 36, 'HKG': 344, 'JPN': 392,
                            'MEX': 484, 'KHM': 116, 'THA': 764, 'JAM': 388, 'ARM': 51
                        },
                        ordered=True,
                    ),
                })
            ),
        ],
    )
    def test_roundtrip_rt_pa_rt(self, rt_dset: rt.Dataset) -> None:
        """Convert ``rt_dset`` with ``to_arrow`` and back with ``from_arrow``, then compare columns."""
        arrow_table = rt_dset.to_arrow()
        restored = rt.Dataset.from_arrow(arrow_table, zero_copy_only=False)

        assert rt_dset.keys() == restored.keys()
        for col_name in rt_dset.keys():
            # The Categorical check is relaxed on purpose: this test targets the
            # dataset-level conversion, not the details of Categorical conversion.
            assert_array_or_cat_equal(rt_dset[col_name], restored[col_name], relaxed_cat_check=True)
示例#5
0
    def test_accum_cols_multikey(self):
        """Check ``rt.accum_cols`` keyed by a two-column Categorical against a hand-built Dataset."""
        num_rows = 12
        half, quarter = num_rows // 2, num_rows // 4
        data = rt.Dataset({
            'Symb': rt.Cat(['A', 'B'] * half),
            'Exch': rt.Cat(['X', 'Y', 'Y', 'X'] * quarter),
            'Count': rt.full(num_rows, 1.0),
            'PlusMinus': [1.0, -1.0] * half,
        })
        data.MultiKeyCat = rt.Cat([data.Symb, data.Exch])

        value_columns = [data.Count, data.PlusMinus]
        value_names = ['Count', 'PlusMinus']
        actual = rt.accum_cols(data.MultiKeyCat, value_columns, value_names)

        expected = rt.Dataset({
            'Symb': ['A', 'B', 'A', 'B'],
            'Exch': ['X', 'Y', 'Y', 'X'],
            'Count': [3.0, 3.0, 3.0, 3.0],
            'PlusMinus': [3.0, -3.0, 3.0, -3.0],
        })
        total_footer = {'Exch': 'Total', 'Count': 12.0, 'PlusMinus': 0.0}
        expected.footer_set_values('Total', total_footer)

        self.assertTrue((actual == expected).all(axis=None))
 def test_aggs_var_symb_0_25_ncols_5(self):
     """Compare ``Categorical.var`` with pandas ``groupby(...).var()`` column by column."""
     fixture = categorical_base(5, 0.25, "var")
     rt_cat = rt.Categorical(
         values=fixture.bin_ids,
         categories=fixture.keys,
         base_index=default_base_index,
     )
     rt_result = rt_cat.var(rt.Dataset(fixture.data))
     pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).var()
     for col_name, _ in fixture.data.items():
         safe_assert(remove_nan(pd_result[col_name]), remove_nan(rt_result[col_name]))
 def test_aggs_mean_symb_0_40_ncols_6(self):
     """Compare ``Categorical.mean`` with pandas ``groupby(...).mean()`` column by column."""
     fixture = categorical_base(6, 0.40, "mean")
     rt_cat = rt.Categorical(
         values=fixture.bin_ids,
         categories=fixture.keys,
         base_index=default_base_index,
     )
     rt_result = rt_cat.mean(rt.Dataset(fixture.data))
     pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).mean()
     for col_name, _ in fixture.data.items():
         safe_assert(remove_nan(pd_result[col_name]), remove_nan(rt_result[col_name]))
 def test_aggs_sum_symb_0_10_ncols_7(self):
     """Compare ``Categorical.sum`` with pandas ``groupby(...).sum()`` column by column."""
     fixture = categorical_base(7, 0.10, "sum")
     rt_cat = rt.Categorical(
         values=fixture.bin_ids,
         categories=fixture.keys,
         base_index=default_base_index,
     )
     rt_result = rt_cat.sum(rt.Dataset(fixture.data))
     pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).sum()
     for col_name, _ in fixture.data.items():
         safe_assert(remove_nan(pd_result[col_name]), remove_nan(rt_result[col_name]))
示例#9
0
class TestHStackAny:
    """Tests for the rt.hstack_any (a.k.a. rt.stack_rows) function."""

    # Shared inputs, built once at class-definition time so they can be used
    # inside the parametrize decorator below.
    _fa1 = rt.FastArray([100, 200])
    _fa2 = rt.FastArray([111, 222])
    _dtn1 = rt.DateTimeNano('2021-10-12 01:02:03', from_tz='UTC')
    _dtn2 = rt.DateTimeNano('1980-03-04 13:14:15', from_tz='UTC')
    _ts1 = _dtn1 - _dtn2
    _ts2 = _dtn2 - _dtn1
    _ds1 = rt.Dataset({'a': 11})
    _ds2 = rt.Dataset({'b': 22})
    _pds1 = rt.PDataset(_ds1)
    _pds2 = rt.PDataset(_ds2)

    @pytest.mark.parametrize(
        "inputs,expected",
        [
            # Homogeneous inputs.
            pytest.param([_fa1, _fa2], rt.FastArray, id='FastArray,FastArray'),
            pytest.param([_dtn1, _dtn2], rt.DateTimeNano, id='DateTimeNano,DateTimeNano'),
            pytest.param([_ts1, _ts2], rt.TimeSpan, id='TimeSpan,TimeSpan'),
            pytest.param([_ds1, _ds2], rt.Dataset, id='Dataset,Dataset'),
            pytest.param([_pds1, _pds2], None, id='PDataset,PDataset'), # notyet

            # Mixed array types.
            pytest.param([_dtn1, _ts2], None, id='DateTimeNano,TimeSpan'), # neither is base
            pytest.param([_fa1, _dtn2], rt.FastArray, id='FastArray,DateTimeNano'),
            pytest.param([_ts1, _fa2], rt.FastArray, id='TimeSpan,FastArray'),

            # Mixed dataset types, and array mixed with dataset.
            pytest.param([_ds1, _pds2], rt.Dataset, id='Dataset,PDataset'),
            pytest.param([_pds1, _ds2], rt.Dataset, id='PDataset,Dataset'),
            pytest.param([_fa1, _ds2], None, id='FastArray,Dataset'),
        ],
    )
    def test_hstack_any(self, inputs, expected):
        """Check the result type of ``rt.hstack_any(inputs)``.

        ``expected`` is the exact type the result must have, or ``None`` when
        the call is expected to raise.
        """
        if expected is None:
            with pytest.raises(Exception):
                rt.hstack_any(inputs)
        else:
            result = rt.hstack_any(inputs)
            # Exact type match on purpose: a subclass of the expected type is a failure.
            assert type(result) is expected
示例#10
0
    def test_accum_cols_noncat(self):
        """Check ``rt.accum_cols`` when the key is a plain FastArray rather than a Categorical."""
        num_rows = 10
        keys = rt.FA([0, 1] * (num_rows // 2))
        ones = rt.full(num_rows, 1.0)

        actual = rt.accum_cols(keys, ones)

        expected = rt.Dataset({'YLabel': [0, 1], 'col0': [5.0, 5.0]})
        total_footer = {'YLabel': 'Total', 'col0': 10.0}
        expected.footer_set_values('Total', total_footer)

        self.assertTrue((actual == expected).all(axis=None))
示例#11
0
    def test_alignmk(self):
        """Check ``alignmk`` with Categorical keys and with plain string-array keys."""
        ds1 = rt.Dataset()
        ds1['Time'] = [0, 1, 4, 6, 8, 9, 11, 16, 19, 30]
        ds1['Px'] = [10, 12, 15, 11, 10, 9, 13, 7, 9, 10]

        ds2 = rt.Dataset()
        ds2['Time'] = [0, 0, 5, 7, 8, 10, 12, 15, 17, 20]
        ds2['Vols'] = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]

        expected_indices = [0, 0, 0, 2, 4, 4, 4, 6, 8, 8]

        # First pass uses Categorical keys, second pass char-array keys;
        # both are expected to produce the same alignment.
        for make_keys in (rt.Categorical, rt.FastArray):
            ds1['Ticker'] = make_keys(['Test'] * 10)
            ds2['Ticker'] = make_keys(['Test', 'Blah'] * 5)
            aligned = alignmk(ds1.Ticker, ds2.Ticker, ds1.Time, ds2.Time)
            assert_array_equal(aligned, rt.FastArray(expected_indices))
示例#12
0
    def test_save_load_dataset_array(self, arr, tmpdir):
        """Round-trip ``arr`` through ``Dataset.save`` / ``Dataset.load``.

        The array is stored as the only column of a Dataset, first as passed
        in and then wrapped in ``rt.FA``; each time the loaded Dataset and the
        loaded column are compared with what was saved.
        """
        # Test #1: save and load of ndarray within Dataset
        # name(...) supplies both the file name under tmpdir and the column name.
        fn = str(tmpdir.join(name(arr)))

        ds = rt.Dataset({name(arr): arr})

        ds.save(fn)
        ds2 = rt.Dataset.load(fn)

        assert_save_load(ds2, ds)
        assert_array_equal_(ds2[name(arr)], ds[name(arr)])

        # Test #2: save and load of FastArray derived from ndarray within Dataset
        f_arr = rt.FA(arr)
        fn = str(tmpdir.join(name(f_arr)))

        ds = rt.Dataset({name(f_arr): f_arr})
        ds.save(fn)
        ds2 = rt.Dataset.load(fn)

        assert_save_load(ds2, ds)
        # NOTE(review): argument order here (saved, loaded) is the reverse of Test #1 (loaded, saved).
        assert_array_equal_(ds[name(f_arr)], ds2[name(f_arr)])
示例#13
0
    def test_accum_cols(self):
        """Check ``rt.accum_cols`` keyed by a single Categorical, including a zero-valued footer."""
        num_rows = 10
        half = num_rows // 2
        data = rt.Dataset({
            'Symb': rt.Cat(['A', 'B'] * half),
            'Count': rt.full(num_rows, 1.0),
            # Added to handle edge case of zero footer
            'PlusMinus': [1.0, -1.0] * half,
        })

        value_columns = [data.Count, data.PlusMinus]
        value_names = ['Count', 'PlusMinus']
        actual = rt.accum_cols(data.Symb, value_columns, value_names)

        expected = rt.Dataset({
            'Symb': ['A', 'B'],
            'Count': [5.0, 5.0],
            'PlusMinus': [5.0, -5.0]
        })
        total_footer = {'Symb': 'Total', 'Count': 10.0, 'PlusMinus': 0.0}
        expected.footer_set_values('Total', total_footer)

        self.assertTrue((actual == expected).all(axis=None))
示例#14
0
    def test_advanced_multikey(self):
        """Compare ``rt.GroupBy(...).sum()`` with pandas ``groupby(...).sum()`` for 5 down to 1 key columns.

        Random key and value columns are generated once; the comparison is then
        repeated with the first ``n`` key/value pairs for ``n = numb_kvs, ..., 1``.
        """
        ##data generation code
        alpha = 'Q W E R T Y U I O P A S D F G H J K L Z X C V B N M'.split(' ')
        digits = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]

        sz = 200
        # Column names come from alpha: keys use alpha[0:n] and values use
        # alpha[n:2n], so this can't be more than len(alpha) // 2 == 13.
        numb_kvs = 5

        # 2d array of keys/values.
        # Each row must be its own list: ``[[0] * sz] * numb_kvs`` would repeat a
        # reference to one inner list, making every column identical.
        vals = [[0] * sz for _ in range(numb_kvs)]
        keys = [[''] * sz for _ in range(numb_kvs)]

        # random initialization for them
        for n in range(numb_kvs):
            for i in range(sz):
                vals[n][i] = digits[rand.randint(0, 1000) % len(digits)]
                keys[n][i] = alpha[rand.randint(0, 1000) % len(alpha)]

        # create the data map
        # multi key hash for numbkeys 1:numb_kvs
        for n_keys in range(numb_kvs, 0, -1):
            data = {}
            for n in range(n_keys):
                data[alpha[n]] = keys[n]
                data[alpha[n + n_keys]] = vals[n]

            key_cols = alpha[0:n_keys]
            val_cols = alpha[n_keys : n_keys * 2]

            mset = rt.Dataset(data)
            s_group = rt.GroupBy(mset, keys=key_cols).sum()

            df2 = pd.DataFrame(data)
            p_group = df2.groupby(key_cols).sum()

            # NOTE(review): list() over a pandas DataFrame yields its column
            # labels, not its values -- confirm this comparison checks what is intended.
            pandas_ = list(p_group[val_cols])
            sfw_ = list(s_group[val_cols])

            assert pandas_ == sfw_
示例#15
0
        def inner(cat):
            """Round-trip ``cat`` through SDS, standalone and as a Dataset column.

            ``tmpdir`` is taken from the enclosing scope. Both parts write to
            the same path ``fn``.
            """
            # Test #1: save and load Categorical
            fn = str(tmpdir.join(name(cat)))

            save_sds(fn, cat)
            cat2 = load_sds(fn)

            assert_save_load(cat2, cat)
            assert cat == cat2

            # Test #2: save and load Categorical from within Dataset
            # name(cat) doubles as the column name.
            ds = rt.Dataset({name(cat): cat})

            ds.save(fn)
            ds2 = rt.Dataset.load(fn)

            assert_save_load(ds2, ds)
            assert ds[name(cat)] == ds2[name(cat)]
示例#16
0
    def test_single_col_groupby_tests(self):
        """Compare single-key groupby reductions between pandas and riptable.

        For every dtype in ``type_list`` and every reduction named in
        ``functions_str`` the same data is grouped by the ``'Ks'`` column in
        both libraries. The pandas ``'Vs'`` column is compared with the
        riptable ``'Vs'`` column (``'Count'`` for the ``count`` reduction)
        using ``safe_equal``.
        """
        Values = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
        Keys = ['a', 'b', 'c', 'a', 'b', 'c', 'd', 'e', 'f']
        key = 'Ks'
        val = 'Vs'

        for type_ in type_list:
            data = {'Vs': rt.FastArray(Values, dtype=type_), 'Ks': Keys}

            pd_gb = pd.DataFrame(data).groupby(key)
            sfw_gb = rt.Dataset(data).groupby(key)

            # Depends only on the dtype, so compute it once per dtype.
            is_integer_subtype = np.issubdtype(type_, np.integer)

            for func_name in functions_str:
                pd_out = getattr(pd_gb, func_name)()
                sfw_out = getattr(sfw_gb, func_name)()

                pd_col = pd_out[val]._values
                # riptable reports group sizes in a 'Count' column.
                sfw_col = sfw_out['Count'] if func_name == 'count' else sfw_out[val]

                # NOTE(review): as written, a mismatch is only reported for
                # 'median' on non-integer dtypes; every other mismatch is
                # ignored. This preserves the original condition (whose flag
                # was named `is_median` but held `name != 'median'`) -- confirm
                # whether the intent was the opposite.
                mismatch_is_reported = (not is_integer_subtype) and func_name == 'median'
                if not safe_equal(pd_col, sfw_col) and mismatch_is_reported:
                    self.fail(
                        f'pandas and riptable disagree for {func_name!r}:\n'
                        f'data_type_t = {type_}\n'
                        f'pandas output = {pd_col}\n'
                        f'sfw    output = {sfw_col}'
                    )
示例#17
0
    def test_apply_nonreduce(self):
        """Check that ``apply_nonreduce`` with a two-argument function matches ``np.maximum``.

        The ``apply_reduce`` calls are executed but their results are not
        inspected by this test.
        """
        arrsize = 200
        numrows = 7
        ds = rt.Dataset({'time': rt.arange(arrsize * 1.0)})
        ds.data = arange(arrsize) % numrows
        ds.data2 = (arange(arrsize) + 3) % numrows

        # Five tickers followed by the strings '6' through '18'.
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM'] + [str(n) for n in range(6, 19)]
        ds.symbol = rt.Cat(1 + rt.arange(arrsize) % len(symbols), symbols)

        def summed_minimum(x, y):
            return np.sum(np.minimum(x, y))

        result = ds.symbol.apply_reduce(summed_minimum, (ds.data, ds.data))

        ac = ds.accum2('symbol', 'data')
        newds = ac.apply_nonreduce(np.cumsum)

        def summed_maximum(x, y):
            return np.sum(np.maximum(x, y))

        ds2 = ac.apply_reduce(summed_maximum, (newds.data, newds.data2))

        direct = np.maximum(newds.data, newds.data2)
        via_apply = ac.apply_nonreduce(lambda x, y: np.maximum(x, y),
                                       (newds.data, newds.data2))[0]
        self.assertTrue(np.all(direct == via_apply))
示例#18
0
    def test_groupby_categorical_sort(self):
        """
        Test that groupby on a categorical sorts the dataset correctly
        """
        labels = ['z', 'y', 'x', 'w', 'a', 'b', 'c', 'd']
        numbers = [0, 1, 2, 3, 4, 5, 6, 7]
        number_for_label = dict(zip(labels, numbers))

        # 100 rows cycling through the labels; each label always carries the same number.
        positions = [row % len(labels) for row in range(100)]

        ds = rt.Dataset()
        ds["Cat"] = rt.Categorical([labels[pos] for pos in positions])

        # two identical columns
        ds["Value1"] = [numbers[pos] for pos in positions]
        ds["Value2"] = [numbers[pos] for pos in positions]

        grp = ds.groupby("Cat").mean()
        grp["Expected"] = [number_for_label[label] for label in grp.Cat.astype('U')]

        # Total absolute deviation of both value columns from the expected means.
        diff = rt.sum(rt.abs(grp.Expected - grp.Value1))
        diff = diff + rt.sum(rt.abs(grp.Expected - grp.Value2))

        assert diff <= 1e-9
示例#19
0
    def test_stack_save_load(self, dataframe, stack_count, tmpdir, stack):
        """Save one Dataset, then load the same file ``i`` times for ``i`` in ``range(stack_count)``.

        With an empty file list, ``load_sds`` must raise ``ValueError`` when
        ``stack`` is truthy and return ``None`` otherwise. With a non-empty
        list, a stacked load must give a ``PDataset`` of the expected shape and
        a non-stacked load must give a ``list``.
        """
        def assert_stack_equal(pds, ds, num_stack=1):
            # The loaded object must be a distinct PDataset holding num_stack copies of ds's rows.
            assert pds is not ds, f"Identity of saved {name(ds)} should be different from the loaded {name(ds)}."
            assert isinstance(pds, rt.PDataset), f"got type {type(pds)}"
            expected_shape = (num_stack * ds.shape[0], ds.shape[1])
            assert pds.shape == expected_shape, f"Shapes should be the same.\n{name(ds)}\n{repr(ds)}\n{name(pds)}\n{pds}"
            # TODO consider stacking
            # for f_arr1, f_arr2 in zip(pds.values(), ds.values()):
            #     assert_array_equal_(f_arr2._np, f_arr1._np)

        fn = str(tmpdir.join(name(dataframe)))

        ds = rt.Dataset(dataframe)
        save_sds(fn, ds)

        for i in range(stack_count):
            if i == 0:
                # expectations for empty input
                if stack:
                    with pytest.raises(ValueError):
                        _ = load_sds([fn] * i, stack=stack)
                else:
                    pds = load_sds([fn] * i, stack=stack)
                    assert pds is None, f"got type {type(pds)}"
            else:
                # expectations for n+1 input where n is a positive nonzero integer
                pds = load_sds([fn] * i, stack=stack)
                if stack:
                    assert_stack_equal(pds, ds, num_stack=i)
                else:
                    # handle expectations for non-stacked load
                    assert isinstance(pds, list), f"got type {type(pds)}"
示例#20
0
    def test_multkey(self):
        """Compare a two-key ``rt.GroupBy(...).sum()`` with pandas ``groupby(...).sum()`` on random data."""
        alpha = 'Q W E R T Y U I O P A S D F G H J K L Z X C V B N M'.split(' ')
        digits = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]

        sz = 4000
        numbers = []
        keys1 = []
        keys2 = []

        # One value and two keys are drawn per row, in that order.
        for _ in range(sz):
            numbers.append(digits[rand.randint(0, 1000) % len(digits)])
            keys1.append(alpha[rand.randint(0, 1000) % len(alpha)])
            keys2.append(alpha[rand.randint(0, 1000) % len(alpha)])

        # Constructed but not used by the comparison below.
        ary = rt.FastArray(numbers)

        data = {'k1': keys1, 'k2': keys2, 'beta': numbers}
        group_keys = ['k1', 'k2']

        # riptable side
        rt_sums = rt.GroupBy(rt.Dataset(data), keys=group_keys).sum()

        # pandas side
        pd_sums = pd.DataFrame(data).groupby(group_keys).sum()

        pandas_beta = list(pd_sums['beta'])
        sfw_beta = list(rt_sums['beta'])
        assert pandas_beta == sfw_beta
示例#21
0
def numpy_array_to_dataset(inarray: numpy.ndarray, columns=None):
    """Build an ``rt.Dataset`` from the dict produced by ``numpy_array_to_dict``.

    ``inarray`` and ``columns`` are forwarded unchanged to ``numpy_array_to_dict``.
    """
    as_dict = numpy_array_to_dict(inarray, columns=columns)
    return rt.Dataset(as_dict)
示例#22
0
    def test_accum_ratiop(self):
        """Check ``rt.accum_ratiop`` for an invalid ``norm_by`` and for ``norm_by`` of 'T' and 'c'."""
        num_rows = 12
        repeats = num_rows // 4
        data = rt.Dataset({
            'Symb': rt.Cat(['A', 'A', 'A', 'B'] * repeats),
            'Exch': rt.Cat(['Z', 'Z', 'X', 'X'] * repeats),
            'Count': rt.full(num_rows, 1.0),
        })

        # Invalid input: an unrecognised norm_by value must raise ValueError.
        with self.assertRaises(ValueError, msg='Failed to raise an error when passing invalid norm_by arg'):
            rt.accum_ratiop(data.Symb, data.Exch, data.Count, norm_by='z')

        # Ratio within total
        actual = rt.accum_ratiop(data.Symb, data.Exch, data.Count, norm_by='T')
        expected = rt.Dataset({
            'Symb': ['A', 'B'],
            'X': [25.0, 25.0],
            'Z': [50.0, 0.0],
            'TotalRatio': [75.0, 25.0],
            'Total': [9.0, 3.0],
        })
        ratio_footer = {'Symb': 'TotalRatio', 'X': 50.0, 'Z': 50.0, 'TotalRatio': 100.0}
        expected.footer_set_values('TotalRatio', ratio_footer)
        total_footer = {'Symb': 'Total', 'X': 6.0, 'Z': 6.0, 'Total': 12.0}
        expected.footer_set_values('Total', total_footer)
        self.assertTrue((actual == expected).all(axis=None))

        # Ratio within columns
        actual = rt.accum_ratiop(data.Symb, data.Exch, data.Count, norm_by='c')
        expected = rt.Dataset({
            'Symb': ['A', 'B'],
            'X': [50.0, 50.0],
            'Z': [100.0, 0.0],
            'TotalRatio': [75.0, 25.0],
            'Total': [9.0, 3.0],
        })
        ratio_footer = {'Symb': 'TotalRatio', 'X': 100.0, 'Z': 100.0, 'TotalRatio': 100.0}
        expected.footer_set_values('TotalRatio', ratio_footer)
        total_footer = {'Symb': 'Total', 'X': 6.0, 'Z': 6.0, 'Total': 12.0}
        expected.footer_set_values('Total', total_footer)
        self.assertTrue((actual == expected).all(axis=None))
示例#23
0
    def test_multi_col_groupby_tests(self,
                                     numb_keys_and_values=5,
                                     numb_rows=20):
        """Compare multi-key groupby reductions between pandas and riptable.

        For every dtype in ``type_list``, ``numb_keys_and_values`` key columns
        and as many value columns of ``numb_rows`` random entries are built.
        Both libraries group by all key columns and every reduction named in
        ``functions_str`` is compared with ``safe_equal``.

        ``numb_keys_and_values`` must not exceed the number of available column
        names (5).
        """
        col_val_names = ['alpha', 'beta', 'gamma', 'sigma', 'zeta']
        col_key_names = ['lions', 'tigers', 'bears', 'oh', 'my']

        MAX_LENGTH = min(len(col_val_names), len(col_key_names))
        assert numb_keys_and_values <= MAX_LENGTH

        # Only these columns exist in the generated data.
        key_names = col_key_names[0:numb_keys_and_values]
        val_names = col_val_names[0:numb_keys_and_values]

        for type_ in type_list:

            vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
            keys = 'a b c d e f g'.split(' ')

            vs = []
            ks = []

            for _ in range(numb_keys_and_values):
                vs.append([
                    vals[rand.randint(0, len(vals) - 1)]
                    for _ in range(numb_rows)
                ])
                ks.append([
                    keys[rand.randint(0, len(keys) - 1)]
                    for _ in range(numb_rows)
                ])

            data = {}

            for col in range(numb_keys_and_values):
                data[col_val_names[col]] = rt.FastArray(vs[col], dtype=type_)
                # NOTE(review): the key column is built from vs[col], so it
                # duplicates the value column, and ks is never used. Presumably
                # ks was intended here -- confirm before changing, since the
                # string keys cannot take dtype=type_.
                data[col_key_names[col]] = rt.FastArray(vs[col], dtype=type_)

            pd_gb = pd.DataFrame(data).groupby(key_names)
            sfw_gb = rt.Dataset(data).groupby(key_names)

            # Depends only on the dtype, so compute it once per dtype.
            is_integer_subtype = np.issubdtype(type_, np.integer)

            for func_name in functions_str:
                pd_out = getattr(pd_gb, func_name)()
                sfw_out = getattr(sfw_gb, func_name)()

                if func_name == 'count':
                    # only compare one column for count
                    pd_col = pd_out['alpha']
                    sfw_col = sfw_out.Count
                    if not safe_equal(pd_col, sfw_col):
                        self.fail(
                            f'pandas and riptable disagree for {func_name!r}:\n'
                            f'pandas output = {pd_col}\n'
                            f'sfw    output = {sfw_col}'
                        )
                    continue

                # Iterate only the value columns that were generated; using the
                # full name list raises KeyError when fewer than 5 are requested.
                for val_name in val_names:
                    # extract array from pandas series
                    pd_col = pd_out[val_name]._values
                    sfw_col = sfw_out[val_name]

                    # NOTE(review): as written, a mismatch is only reported for
                    # 'median' on non-integer dtypes; every other mismatch is
                    # ignored. This preserves the original condition (whose
                    # flag was named `is_median` but held `name != 'median'`)
                    # -- confirm whether the intent was the opposite.
                    mismatch_is_reported = (not is_integer_subtype) and func_name == 'median'
                    if not safe_equal(pd_col, sfw_col) and mismatch_is_reported:
                        self.fail(
                            f'pandas and riptable disagree for {func_name!r} '
                            f'on column {val_name!r}:\n'
                            f'pandas output = {pd_col}\n'
                            f'sfw    output = {sfw_col}'
                        )
示例#24
0
    def test_save_load_datasets(self, dataframe, tmpdir):
        """Round-trip a Dataset built from ``dataframe`` through SDS, alone and nested in a Multiset.

        The Multiset holds a shallow and a deep copy of the Dataset. After
        loading, each nested Dataset must be a different object from the saved
        one and every column must hold the same data as the saved column.
        """
        # Test #1: save and load of DataFrame
        # Not exercised: per the original note, save_sds() raises TypeError for a
        # pandas DataFrame ("can only save Structs, Datasets, or single arrays").
        fn = str(tmpdir.join(name(dataframe)))

        # Test #2: save and load of Dataset created from DataFrame
        dataset = rt.Dataset(dataframe)

        save_sds(fn, dataset)
        dataset2 = load_sds(fn)

        assert_save_load(dataset2, dataset)
        for f_arr1, f_arr2 in zip(dataset.values(), dataset2.values()):
            assert_array_equal_(f_arr2._np, f_arr1._np)

        # Test #3: save and load nested Dataset within a Multiset
        # This also tests that shallow and deep copies that are saved and loaded from SDS
        # are both unique objects with the same size footprint.
        multiset = rt.Multiset()
        shallow_copy_name, deep_copy_name = "dataset_shallow_copy", "dataset_deep_copy"
        dataset_shallow_copy, dataset_deep_copy = (
            dataset.copy(deep=False),
            dataset.copy(deep=True),
        )
        multiset[shallow_copy_name], multiset[deep_copy_name] = (
            dataset_shallow_copy,
            dataset_deep_copy,
        )

        fn = str(tmpdir.join(name(multiset)))
        save_sds(fn, multiset)
        multiset2 = load_sds(fn)

        assert_save_load(multiset2, multiset)
        # Shallow copy assertions
        assert id(multiset[shallow_copy_name]) != id(
            multiset2[shallow_copy_name]
        ), "Identity of saved object should be different from the loaded object."
        for f_arr1, f_arr2 in zip(multiset[shallow_copy_name].values(),
                                  multiset2[shallow_copy_name].values()):
            # Convert these to ndarrays so we don't need to consider Riptable invalid checks.
            # This test is concerned with ensuring the same data is loaded as saved.
            assert_save_load(f_arr2, f_arr1)
            # Compare loaded (f_arr2) with saved (f_arr1); comparing f_arr2
            # with itself could never fail.
            assert_array_equal_(f_arr2._np, f_arr1._np)

        # Deep copy assertions
        assert id(multiset[deep_copy_name]) != id(
            multiset2[deep_copy_name]
        ), "Identity of saved object should be different from the loaded object."
        for f_arr1, f_arr2 in zip(multiset[deep_copy_name].values(),
                                  multiset2[deep_copy_name].values()):
            assert_save_load(f_arr2, f_arr1)
            # Loaded vs. saved, as above.
            assert_array_equal_(f_arr2._np, f_arr1._np)
示例#25
0
    def test_meta(self):
        st = rt.Struct({
            'a':
            rt.Dataset({
                'col1': rt.FastArray([1, 2]).astype(np.int32),
                'col2': rt.FastArray([3, 4]).astype(np.int32),
                'col4': rt.FastArray([5, 6]).astype(np.int32),
            }),
            'b':
            rt.FastArray([3, 4]).astype(np.int32),
        })
        out = StringIO()
        orig_stdout = sys.stdout
        sys.stdout = out
        print(st.info())
        output = out.getvalue()
        target_output = '''\x1b[1;36mDescription: \x1b[00m<no description>
\x1b[1;36mSteward: \x1b[00m<no steward>
\x1b[1;36mType: \x1b[00mStruct
\x1b[1;36mContents:\x1b[00m

\x1b[1;36mType     Name  Description                                         Steward     \x1b[00m
\x1b[1;36m-------  ----  --------------------------------------------------  ------------\x1b[00m
Dataset  \x1b[1;32ma   \x1b[00m  <no description>                                    <no steward>
int32    \x1b[1;32mb   \x1b[00m  <no description>                                    <no steward>
'''
        self.assertEqual(output, target_output)

        schema = {'Description': 'This is a structure', 'Steward': 'Nick'}
        st.apply_schema(schema)
        st2 = rt.Struct({
            'This': st,
            'That': np.array([1, 2]).astype(np.int32)
        })
        out = StringIO()
        sys.stdout = out
        print(st2.info())
        output = out.getvalue()
        target_output = '''\x1b[1;36mDescription: \x1b[00m<no description>
\x1b[1;36mSteward: \x1b[00m<no steward>
\x1b[1;36mType: \x1b[00mStruct
\x1b[1;36mContents:\x1b[00m

\x1b[1;36mType    Name  Description                                         Steward     \x1b[00m
\x1b[1;36m------  ----  --------------------------------------------------  ------------\x1b[00m
Struct  \x1b[1;32mThis\x1b[00m  This is a structure                                 Nick        
int32   \x1b[1;32mThat\x1b[00m  <no description>                                    <no steward>
'''
        self.assertEqual(output, target_output)

        schema = {
            'Description': 'This is a structure',
            'Steward': 'Nick',
            'Type': 'Struct',
            'Contents': {
                'This': {
                    'Description': 'This is a nested structure',
                    'Steward': 'Bob',
                    'Type': 'AttackHelicoptor',
                    'Contents': {
                        'a': {
                            'Description': 'A description for a',
                            'Steward': 'Fred',
                            'Contents': {
                                'col1': {
                                    'Description': 'This describes column 1',
                                    'Steward': 'Jay',
                                    'Type': 'int32',
                                },
                                'col2': {
                                    'Description': 'This describes column 2',
                                    'Steward': 'Alex',
                                    'Type': 'float32',
                                },
                                'col3': {
                                    'Description': 'This column is not there',
                                    'Steward': 'Ben',
                                },
                            },
                        },
                        'b': {
                            'Description': 'A descriptiion for b',
                            'Steward': 'George',
                        },
                    },
                },
                'That': {
                    'Description': 'This is an array',
                    'Steward': 'Willy'
                },
            },
        }

        res = st2.apply_schema(schema)
        res_c = {
            'This': {
                'Type Mismatch':
                'Type Struct does not match schema type AttackHelicoptor',
                'a': {
                    'col2': {
                        'Type Mismatch':
                        'Type int32 does not match schema type float32'
                    },
                    'Extra Column': 'col4',
                    'Missing Column': 'col3',
                },
            }
        }
        self.assertEqual(res, res_c)

        out = StringIO()
        sys.stdout = out
        print(st2.info())
        output = out.getvalue()
        target_output = '''\x1b[1;36mDescription: \x1b[00mThis is a structure
\x1b[1;36mSteward: \x1b[00mNick
\x1b[1;36mType: \x1b[00mStruct
\x1b[1;36mContents:\x1b[00m

\x1b[1;36mType    Name  Description                                         Steward     \x1b[00m
\x1b[1;36m------  ----  --------------------------------------------------  ------------\x1b[00m
Struct  \x1b[1;32mThis\x1b[00m  This is a nested structure                          Bob         
int32   \x1b[1;32mThat\x1b[00m  This is an array                                    Willy       
'''
        self.assertEqual(output, target_output)

        out = StringIO()
        sys.stdout = out
        print(st2.This.info())
        output = out.getvalue()
        target_output = '''\x1b[1;36mDescription: \x1b[00mThis is a nested structure
\x1b[1;36mSteward: \x1b[00mBob
\x1b[1;36mType: \x1b[00mStruct
\x1b[1;36mContents:\x1b[00m

\x1b[1;36mType     Name  Description                                         Steward     \x1b[00m
\x1b[1;36m-------  ----  --------------------------------------------------  ------------\x1b[00m
Dataset  \x1b[1;32ma   \x1b[00m  A description for a                                 Fred        
int32    \x1b[1;32mb   \x1b[00m  A descriptiion for b                                George      
'''
        self.assertEqual(output, target_output)

        out = StringIO()
        sys.stdout = out
        print(st2.This.a.info())
        output = out.getvalue()
        target_output = '''\x1b[1;36mDescription: \x1b[00mA description for a
\x1b[1;36mSteward: \x1b[00mFred
\x1b[1;36mType: \x1b[00mDataset
\x1b[1;36mContents:\x1b[00m

\x1b[1;36mType   Name  Description                                         Steward     \x1b[00m
\x1b[1;36m-----  ----  --------------------------------------------------  ------------\x1b[00m
int32  \x1b[1;32mcol1\x1b[00m  This describes column 1                             Jay         
int32  \x1b[1;32mcol2\x1b[00m  This describes column 2                             Alex        
int32  \x1b[1;32mcol4\x1b[00m  <no description>                                    <no steward>
'''
        self.assertEqual(output, target_output)

        out = StringIO()
        sys.stdout = out
        print(st2.This.a.col1.info())
        output = out.getvalue()
        target_output = '''\x1b[1;36mDescription: \x1b[00mThis describes column 1
\x1b[1;36mSteward: \x1b[00mJay
\x1b[1;36mType: \x1b[00mint32
'''
        self.assertEqual(output, target_output)

        sys.stdout = orig_stdout