Example #1
    def test_nan_funcs(self):
        GroupBy.include_keys = True
        nan_dict = {
            'f32': np.arange(5, dtype=np.float32),
            'f64': np.arange(5, dtype=np.float64),
        }
        nan_dict['f32'][0] = np.nan
        nan_dict['f64'][0] = np.nan
        nan_ds = Dataset(nan_dict)
        # TODO: add when more nan functions have been implemented
        correct = np.array([1.0, 2.0, 3.0, 4.0, 0.0])
        nan_funcs = ['nansum']
        col_names = ['f32', 'f64']

        for c_name in col_names:
            gb = nan_ds.gb(c_name)
            for f_name in nan_funcs:
                call = getattr(gb, f_name)
                result_ds = call()
                for result_name in col_names:
                    column = getattr(result_ds, result_name)
                    # skip groupby column
                    if c_name != result_name:
                        is_correct = bool(np.all(column == correct))
                        self.assertTrue(
                            is_correct,
                            msg=f"Incorrect result for groupby {c_name} in column {result_name} for {f_name} operation.",
                        )
        GroupBy.include_keys = False
Example #2
 def get_basic_dataset(keyfield_value=None, nr=10):
     ds = Dataset({
         _k: list(range(_i * nr, (_i + 1) * nr))
         for _i, _k in enumerate('abcdefghijklmnop')
     })
     if keyfield_value is not None:
         ds.keyfield = keyfield_value
     return ds
Example #3
    def test_iter(self):
        correct_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
        correct_idx = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]
        str_arr = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])

        gb = Dataset({'keycol': str_arr, 'idxcol': arange(10)})
        gb = gb.gb('keycol')
        for i, tup in enumerate(gb):
            self.assertEqual(tup[0], correct_keys[i])
            self.assertTrue(bool(np.all(tup[1].idxcol == correct_idx[i])))
Example #4
    def test_roundtrip_rt_pa_rt(self, rt_dset: rt.Dataset) -> None:
        """Test round-tripping from rt.Dataset to pyarrow.Table and back."""
        result_pa_tbl = rt_dset.to_arrow()
        result_rt_dset = rt.Dataset.from_arrow(result_pa_tbl, zero_copy_only=False)

        assert rt_dset.keys() == result_rt_dset.keys()
        for col_name in rt_dset.keys():
            # relaxed_cat_check=True, because we're not trying to test specific details of Categorical conversion
            # here, we're more interested in the dataset-level stuff.
            assert_array_or_cat_equal(rt_dset[col_name], result_rt_dset[col_name], relaxed_cat_check=True)
Example #5
    def test_nobins(self):
        """
        Tests that Categorical.median() works correctly when there
        are no bins with data because the Categorical has been created
        with a pre-filter which has filtered out all data in the Dataset.
        """

        data = Dataset()
        data.Group = np.random.randint(0, 10, 100_000)
        data.Values = np.random.randint(0, 10, 100_000)
        x = data.cat('Group', filter=data.Group < 0)
        x.median(data.Values)
Example #6
    def test_gb_categoricals(self):
        codes = [1, 44, 44, 133, 75, 75, 75, 1]
        stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
        c1 = Categorical(codes, LikertDecision, sort_gb=True)
        c2 = Categorical(stringlist)
        d = {'nums': np.arange(8)}

        # from enum only
        d_enum = d.copy()
        d_enum['cat_from_enum'] = c1
        ds_enum = Dataset(d_enum)
        enum_result = ds_enum.gb('cat_from_enum').sum()
        correct = FastArray([3, 15, 3, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, enum_result.nums),
            msg=f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
        )

        # from list only
        d_list = d.copy()
        d_list['cat_from_list'] = c2
        ds_list = Dataset(d_list)
        list_result = ds_list.gb('cat_from_list').sum()
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, list_result.nums),
            msg=f"Incorrect sum when grouping by list categorical.",
        )

        d_both = d_enum.copy()
        d_both['cat_from_list'] = c2
        ds_both = Dataset(d_both)

        # by enum, list
        result = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum()
        num_result = result.nums
        correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, num_result),
            msg=f"Incorrect sum when grouping by enum, list categoricals.",
        )

        # by list, enum
        result = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum()
        num_result = result.nums
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, num_result),
            msg=f"Incorrect sum when grouping by list, enum categoricals.",
        )
Example #7
 def test_ops(self):
     ds = Dataset({
         'test': arange(300000) % 3,
         'test2': arange(300000.0),
         'test2i': arange(300000),
         'test3': arange(300000) % 3,
     })
     gb = ds.groupby('test')
     result = gb.mean()
     self.assertTrue(result.test2[0] == result.test2i[0])
     self.assertTrue(result.test2[1] == result.test2i[1])
     self.assertTrue(result.test3[1] == 1.0)
     result = gb.median()
     result = gb.trimbr()
     result = gb.nanmedian()
Example #8
    def test_cumcount_vs_gb(self):
        arr = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
        ds = Dataset({'keycol': arr, 'col1': arange(50), 'col2': arange(50)})
        gb_result = ds.gb('keycol').cumcount()

        c = Categorical(ds.keycol)
        c_result = c.cumcount()

        rdiff = gb_result - c_result
        assert sum(rdiff) == 0

        f = logical(arange(50) % 2)
        c_result = c.cumcount(filter=f)
        assert bool(np.all(isnotnan(c_result[f])))
        assert bool(np.all(isnan(c_result[~f])))
Example #9
    def test_gb_labels_enum(self):
        # make sure enum groupby keys are displayed as string,  not integer code
        c = Categorical([10, 10, 10, 20, 30, 20, 10, 20, 20], {
            'a': 30,
            'b': 20,
            'c': 10
        })
        c_result = c.count()
        c_labels = c_result[c_result.label_get_names()][0]

        ds = Dataset({'catcol': c, 'data': arange(9)})
        ds_result = ds.gbu('catcol').count()
        ds_labels = ds_result[ds_result.label_get_names()][0]

        assert c_labels.dtype.char == ds_labels.dtype.char
        assert bool(np.all(c_labels == ds_labels))
Example #10
    def test_specify_gb_data(self):
        str_col = ['a', 'a', 'b', 'c', 'a']
        num_col = [10, 10, 20, 30, 10]
        col1 = np.arange(5)
        col2 = np.arange(5)
        small_ds = Dataset({
            'str_col': str_col,
            'num_col': num_col,
            'col1': col1,
            'col2': col2
        })
        ds_to_operate_on = small_ds[['col1', 'col2']]

        c = Categorical(str_col)

        # dataset
        d = c.sum(ds_to_operate_on)

        # single
        # list
        d = c.sum([col1, col2])

        # tuple
        d = c.sum((col1, col2))

        # dict
        d = c.sum({'a': col1, 'b': col2})

        # multiple
        d = c.sum(col1, col2)
Example #11
    def test_groupby_ops_multikey_dict(self):
        mk_dict = {'string1': str_fa, 'string2': str_fa}
        mk_gb = Dataset({
            'string1': str_fa.copy(),
            'string2': str_fa.copy(),
            'ints': int_fa,
            'floats': flt_fa,
            'tens': tens,
        }).gbu(['string1', 'string2'])
        c = Categorical(mk_dict)
        self.funnel_all_tests(c, mk_gb, "multikey dictionary", sorted=False)

        # setitem hits comparison functions - need to rewrite these tests after comparison behavior change
        # self.mk_set_item(mk_dict, constructor_name="multikey dictionary")

        # conflicting names
        x = str_fa.copy()
        y = str_fa.copy()
        z = str_fa.copy()
        x.set_name('strings')
        y.set_name('strings')
        z.set_name('strings1')
        c = Categorical([x, y, z])
        assert c._categories_wrap.ncols == 3, (
            f"incorrect number of columns for multikey from list. {c._categories_wrap.ncols} vs. 3"
        )
        # 04/25/2019 all default column names now happen in grouping object
        assert list(c.categories().keys()) == ['strings', GROUPBY_KEY_PREFIX + '_c1', 'strings1'], (
            f"column names did not match for multikey from list. {list(c.categories().keys())} vs. ['strings','strings2','strings1']"
        )
Example #12
 def test_as_matrix_metadata(self):
     error_tol = 0.00001
      ds = Dataset({
          'A': ['EXCH1', 'EXCH2', 'EXCH1', 'EXCH3', 'EXCH3'],
          'B': [-1.6, 2.7, 4.6, 5.7, 8.9],
          'C': Categorical([0, 0, 1, 0, 2], ['CPTYA', 'CPTYB', 'CPTYC']),
      })
     X, X_data = dataset_as_matrix(ds)
     self.assertIsInstance(X, numpy.ndarray)
     self.assertEqual(X.shape[0], ds.shape[0])
     self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later
     self.assertEqual(X_data['A']['dtype'], ds.A.dtype)
     self.assertEqual(X_data['B']['dtype'], ds.B.dtype)
     self.assertEqual(X_data['C']['dtype'], ds.C.dtype)
     self.assertEqual(X_data['A']['is_categorical'], False)
     self.assertEqual(X_data['B']['is_categorical'], False)
     self.assertEqual(X_data['C']['is_categorical'], True)
      self.assertTrue(
          (numpy.abs(X[:, 0] - numpy.array([0., 1., 0., 2., 2.])) < error_tol).all(),
          msg=f"got {X[:, 0]}",
      )
      self.assertTrue(
          (numpy.abs(X[:, 2] - numpy.array([0, 0, 1, 0, 2])) < error_tol).all(),
          msg=f"got {X[:, 2]}",
      )
      self.assertTrue(
          (X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])] == ds.A).all(),
          msg=f"X_data {X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])]}\nds.A {ds.A}",
      )
Example #13
 def test_dset_dict_to_list(self):
     ds = Dataset({
         _k: list(range(_i * 10, (_i + 1) * 10))
         for _i, _k in enumerate('abcdefghijklmnop')
     })
     ds0 = ds[:3].copy()
     ds1 = ds[6:9].copy()
     ds2 = ds[11:15].copy()
     dd = {'one': ds0, 'two': ds1, 'μεαν': ds2}
     with self.assertRaises(ValueError):
         _ = dset_dict_to_list(dd, 'keyfield')
     dd = {'one': ds0, 'two': ds1, 3: ds2}
     with self.assertRaises(ValueError):
         _ = dset_dict_to_list(dd, 'keyfield')
     dd = {'one': ds0, 'two': ds1, 'three': ds2}
     with self.assertRaises(ValueError):
         _ = dset_dict_to_list(dd, 'a')
     lst1 = dset_dict_to_list(dd, 'keyfield')
     self.assertEqual(id(ds0), id(lst1[0]))
     self.assertEqual(id(ds1), id(lst1[1]))
     self.assertEqual(id(ds2), id(lst1[2]))
     self.assertEqual(list(ds0.keys()), ['a', 'b', 'c', 'keyfield'])
     self.assertTrue((ds0.a == list(range(10))).all())
     self.assertTrue((ds0.keyfield == 'one').all())
     lst2 = dset_dict_to_list(dd, 'a', allow_overwrite=True)
     self.assertEqual(id(ds0), id(lst1[0]))
     self.assertEqual(list(ds0.keys()), ['a', 'b', 'c', 'keyfield'])
     self.assertTrue((ds0.a == 'one').all())
     self.assertTrue((ds0.b == list(range(10, 20))).all())
     self.assertTrue((ds0.keyfield == 'one').all())
Example #14
def read_dset_from_np(outdir: str, fname: str, mmap: bool = False) -> Dataset:
    """
    Read columns stored as numpy binary files into a Dataset.

    Parameters
    ----------
    outdir : str
        The path to the folder containing the saved dataset.
    fname : str
        The name of the subdirectory containing the columns of the dataset.
    mmap : bool, default False
        If True, memory-map the column files. This allows quick loading,
        but has some latency cost elsewhere.

    Returns
    -------
    Dataset
        The dataset read in from the specified folder.

    See Also
    --------
    write_dset_to_np
    """
    mmap_mode = None
    if mmap:
        mmap_mode = 'r'

    fname = os.path.join(outdir, fname)
    fname = os.path.join(fname, 'columns')
    col_dict = dict()
    col_names = os.listdir(path2platform(fname))
    for col_name in col_names:
        fname_col = path2platform(os.path.join(fname, col_name))
        curr_col_name = col_name.replace('.npy', '')
        col_dict[curr_col_name] = np.load(fname_col, mmap_mode=mmap_mode)
    return Dataset(col_dict)
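A minimal usage sketch for this reader, assuming a dataset was previously saved with the companion write_dset_to_np; the directory and subdirectory names below are illustrative only:

    ds_eager = read_dset_from_np('/tmp/rt_out', 'my_dset')            # columns loaded fully into memory
    ds_mmap = read_dset_from_np('/tmp/rt_out', 'my_dset', mmap=True)  # columns memory-mapped from disk
    assert ds_eager.keys() == ds_mmap.keys()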
Example #15
    def test_dataset_as_pandas_df_warn(self):
        ds = Dataset({'a': [1, 2, 3]})

        if pd_version >= pd_ver_0_24:
            with self.assertWarns(FutureWarning):
                df = dataset_as_pandas_df(ds)
        else:
            df = dataset_as_pandas_df(ds)
Example #16
    def test_total_sizes(self):
        st = Struct({
            'a': Dataset({
                # 10x int32 => 40B
                'A': range(10),
                # 10x int32 => 40B
                'B': range(10, 20),
            }),
            'b': Struct({
                # 1x int32 => 4B
                'C': 0,
                # 1x int32 => 4B
                'D': 1,
                # 1x int32 => 4B
                'E': 2,
            }),
            # 5x int32 => 20B
            'c': FastArray(np.arange(5)),
            # 5x int32 => 20B
            'd': np.arange(5, 10),
            # ???
            'e': ['abc', 'def', 'ghi'],
            'f': {
                # 1x int32 => 4B
                'q': 1,
                # 1x int32 => 4B
                'r': 2,
            },
            # 1x float64 => 8B
            'g': 3.14,
            # 1x int32 => 4B
            'h': 84,
            # ???
            'i': None,
            # ???
            'j': slice(None),
        })

        # Create some duplicated/aliased data within the struct.
        st.z = st.c

        # Calculate the sizes of the Struct's data in bytes.
        (physical, logical) = st.total_sizes

        # For now, we only check that the logical size is larger than the physical size
        # (due to the presence of aliased array(s) somewhere within the Struct).
        # TODO: Strengthen this test by checking the actual computed sizes to make sure they're correct.
        self.assertLess(
            physical, logical,
            "The physical size is not less than the logical size.")
Example #17
    def test_as_categorical(self):
        ds = Dataset({
            'keycol1': np.random.choice(['a', 'b', 'c'], 30),
            'keycol2': np.random.choice(['a', 'b', 'c'], 30),
            'data': np.random.rand(30),
        })

        gbu = ds.gbu('keycol1')
        c = Categorical(ds.keycol1, ordered=False, sort_gb=False)
        cgbu = gbu.as_categorical()

        gbu_result = gbu.sum()
        c_result = c.sum(ds.data)
        cgbu_result = cgbu.sum(ds.data)

        for name, col in gbu_result.items():
            assert bool(np.all(c_result[name] == col))
            assert bool(np.all(cgbu_result[name] == col))
Example #18
 def test_as_matrix(self):
     error_tol = 0.00001
     ds = Dataset({'A': [1.2, 3.1, 9.6], 'B': [-1.6, 2.7, 4.6]})
     X, _ = dataset_as_matrix(ds)
     self.assertIsInstance(X, numpy.ndarray)
     self.assertEqual(X.shape[0], ds.shape[0])
     self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later
     self.assertTrue((numpy.abs(ds.A._np - X[:, 0]) < error_tol).all())
     self.assertTrue((numpy.abs(ds.B._np - X[:, 1]) < error_tol).all())
Example #19
    def test_pad(self):
        arrsize = 100
        numrows = 20
        ds = Dataset({'time': arange(arrsize * 1.0)})
        ds.data = np.random.randint(numrows, size=arrsize)
        ds.data2 = np.random.randint(numrows, size=arrsize)
        symbols = ['ZYGO', 'YHOO', 'FB', 'GOOG', 'IBM']
        ds.symbol2 = Cat(1 + ds.data, list('ABCDEFGHIJKLMNOPQRST'))
        ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
        ds.time[[3, 4, 7]] = nan
        newds = ds.gb('symbol').pad()
        self.assertTrue(newds.time[7] == 2.00)
        # time[3] is the first row of its group, so pad leaves it NaN (NaN != NaN)
        self.assertTrue(newds.time[3] != newds.time[3])
        newds = ds.gb('symbol').backfill()
        self.assertTrue(newds.time[7] == 12.00)

        # see if we can pull a group
        newds = ds.gb('symbol').get_group('YHOO')
        self.assertTrue(np.all(newds.symbol == 'YHOO'))
Example #20
    def test_diff(self):
        import pandas as pd

        ds = Dataset({'col_' + str(i): np.random.rand(10) for i in range(5)})
        ds.keycol = np.random.choice(['a', 'b', 'c'], 10)
        df = pd.DataFrame(ds.asdict())

        rt_result = ds.gb('keycol').rolling_diff()
        pd_result = df.groupby('keycol').diff()

        for k, v in rt_result.items():
            pdc = pd_result[k]
            pdcnan = isnan(pdc)
            self.assertTrue(bool(np.all(isnan(v) == pdcnan)), msg=f'{v} {pdc}')

            masked_valid_pd = isnotnan(pdc)
            masked_valid_rt = isnotnan(v)

            self.assertTrue(bool(np.all(masked_valid_pd == masked_valid_rt)))
Example #21
 def test_groupby_ops_multikey_list(self):
     mk_list = [str_fa.copy(), str_fa.copy()]
     mk_gb = Dataset({
         'string1': str_fa.copy(),
         'string2': str_fa.copy(),
         'ints': int_fa,
         'floats': flt_fa,
         'tens': tens,
     }).gbu(['string1', 'string2'])
     c = Categorical(mk_list)
     self.funnel_all_tests(c, mk_gb, "multikey list", sorted=False)
Example #22
 def test_as_matrix_int(self):
     error_tol = 0.00001
     ds = Dataset({
         _k: list(range(_i * 10, (_i + 1) * 10))
         for _i, _k in enumerate('ABCDEFGHIJKLMNOP')
     })
     X, _ = dataset_as_matrix(ds)
     self.assertIsInstance(X, numpy.ndarray)
     self.assertEqual(X.shape[0], ds.shape[0])
     self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later
     self.assertTrue((numpy.abs(ds.A._np - X[:, 0]) < error_tol).all())
     self.assertTrue((numpy.abs(ds.B._np - X[:, 1]) < error_tol).all())
Example #23
 def test_append_dataset_dict(self):
     ds = Dataset({
         _k: list(range(_i * 10, (_i + 1) * 10))
         for _i, _k in enumerate('abcdefghijklmnop')
     })
     ds0 = ds[:3].copy()
     ds1 = ds[6:9].copy()
     ds2 = ds[11:15].copy()
     dd = {'one': ds0, 'two': ds1, 'three': ds2}
     ds = append_dataset_dict(dd, 'keyfield')
     ucols = set()
     for _d in dd.values():
         ucols.update(_d)
     self.assertEqual(set(ds.keys()), ucols)
     self.assertEqual(ds.get_nrows(),
                      sum(_d.get_nrows() for _d in dd.values()))
     keyfield = []
     for _d in dd.values():
         keyfield.extend(_d.keyfield)
     self.assertTrue((ds.keyfield == keyfield).all())
     self.assertTrue((ds.a[:10] == range(10)).all())
     self.assertTrue((ds.g[10:20] == range(60, 70)).all())
     self.assertTrue((ds.l[20:30] == range(110, 120)).all())
Example #24
    def test_add_dataset(self):
        arrsize = 200
        numrows = 7

        ds = Dataset({'time': np.arange(arrsize * 1.0)})
        ds.data = np.random.randint(numrows, size=arrsize)
        ds.data2 = np.random.randint(numrows, size=arrsize)
        symbols = [
            'AAPL',
            'AMZN',
            'FB',
            'GOOG',
            'IBM',
            '6',
            '7',
            '8',
            '9',
            '10',
            '11',
            '12',
            '13',
            '14',
            '15',
            '16',
            '17',
            '18',
        ]
        symbol2 = ['A', 'X', 'P', 'C', 'D', 'E', 'F', 'G', 'G', 'I', 'J', 'K']
        ds.symbol2 = Cat(1 + np.arange(arrsize) % len(symbol2), symbol2)
        ds.symbol = Cat(1 + np.arange(arrsize) % len(symbols), symbols)

        x = ds.copy()
        del x.symbol
        del x.data
        del x.time
        x.label_set_names('data2')

        # now x has two columns, and one is labelled so adding an entire dataset should just add x.symbol2
        ds.junk = x
Example #25
    def test_transform(self):

        arrsize = 200
        numrows = 7

        ds = Dataset({'time': arange(arrsize * 1.0)})
        ds.data = np.random.randint(numrows, size=arrsize)
        ds.data2 = np.random.randint(numrows, size=arrsize)
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
        ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
        newds = ds.gb('symbol')['data'].sum(transform=True)

        # removed from test since gbkeys not returned in transform
        # self.assertTrue(np.all(newds.symbol == ds.symbol))
        catds = ds.symbol.sum(ds.data, transform=True)
        self.assertTrue(np.all(newds[0] == catds[0]))
        # test showfilter
        catds = ds.symbol.sum(ds.data, showfilter=True, transform=True)
        self.assertTrue(np.all(newds[0] == catds[0]))

        # test diff
        result1 = ds.gb('symbol').apply_nonreduce(TypeRegister.FastArray.diff)
        result2 = ds.gb('symbol').diff()
        self.assertTrue(result1.equals(result2))
Example #26
    def test_shift(self):
        import pandas as pd

        ds = Dataset({'col_' + str(i): np.random.rand(30) for i in range(5)})
        ds.keycol = np.random.choice(['a', 'b', 'c'], 30)
        df = pd.DataFrame(ds.asdict())

        rt_result = ds.gb('keycol').shift(periods=-2).trim()
        pd_result = df.groupby('keycol').shift(periods=-2).dropna(axis='rows')

        for k, v in rt_result.items():
            self.assertTrue(bool(np.all(v == pd_result[k])))

        rt_result = ds.gb('keycol').shift(periods=3).trim()
        pd_result = df.groupby('keycol').shift(periods=3).dropna(axis='rows')

        for k, v in rt_result.items():
            self.assertTrue(bool(np.all(v == pd_result[k])))
Example #27
def write_dset_to_np(ds: Dataset, outdir: str, fname: str) -> None:
    """
    Write the columns of a dataset to numpy binary files, one file per column in the specified directory.

    Parameters
    ----------
    ds : Dataset
        A Dataset to write out to disk.
    outdir : str
        The path to the folder where the output will be written.
    fname : str
        The name of the subdirectory to store the columns.

    See Also
    --------
    read_dset_from_np
    """
    os.makedirs(os.path.join(outdir, fname))
    fname = os.path.join(outdir, fname)
    fname = os.path.join(fname, 'columns')
    os.makedirs(path2platform(fname))
    for name, value in ds.items():
        fname_col = os.path.join(fname, str(name))
        np.save(path2platform(fname_col), value)
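A minimal round-trip sketch pairing this writer with read_dset_from_np; the output directory, subdirectory name, and column values are illustrative only:

    import numpy as np
    from riptable import Dataset

    ds = Dataset({'price': np.arange(5) * 1.5, 'size': np.arange(5)})
    write_dset_to_np(ds, '/tmp/rt_out', 'prices')     # creates /tmp/rt_out/prices/columns/*.npy
    ds2 = read_dset_from_np('/tmp/rt_out', 'prices')  # column names come from the .npy file names
    assert (ds2.price == ds.price).all()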
Example #28
    def test_reductions(self):
        message_types = [
            'CREATE',
            'RUN',
            'CREATE',
            'RUN',
            'RUN',
            'RUN',
            'RUN',
            'CANCEL',
            'RUN',
            'RUN',
            'RUN',
            'CANCEL',
        ]
        order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
        seconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
        shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
        d2 = dict(
            message_type=message_types,
            order_id=order_ids,
            second=seconds,
            shares=shares,
        )
        dat = Dataset(d2)
        dat = dat[['order_id', 'message_type', 'second', 'shares']]

        # Numeric reduction
        dsr = dat.groupby('order_id').sum()
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.order_id, [1, 2])
        self.assertArrayEqual(dsr.second, [410, 676])
        self.assertArrayEqual(dsr.shares, [800, 1100])

        # Numeric reduction with all columns returned
        dsr = dat.groupby('order_id', return_all=True).sum()
        self.assertEqual(dsr.shape, (2, 4))
        self.assertEqual(dsr.keys()[1], 'message_type')

        # Order-based reduction
        dsr = dat.groupby('order_id').first()
        self.assertEqual(dsr.shape, (2, 4))
        self.assertArrayEqual(dsr.order_id, [1, 2])
        self.assertArrayEqual(dsr.message_type, ['CREATE', 'CREATE'])
        self.assertArrayEqual(dsr.second, [50, 72])
        self.assertArrayEqual(dsr.shares, [0, 0])

        # Order-based reduction, which returns all columns regardless
        dsr = dat.groupby('order_id', return_all=True).first()
        self.assertEqual(dsr.shape, (2, 4))

        # Order-based reduction with multiple keys
        dsr = dat.groupby(['order_id', 'message_type']).first()
        self.assertEqual(dsr.shape, (6, 4))
        self.assertArrayEqual(dsr.order_id, [1, 1, 1, 2, 2, 2])
        self.assertArrayEqual(
            dsr.message_type,
            ['CANCEL', 'CREATE', 'RUN', 'CANCEL', 'CREATE', 'RUN'])
        self.assertArrayEqual(dsr.second, [120, 50, 70, 97, 72, 90])
        self.assertArrayEqual(dsr.shares, [0, 0, 200, 0, 0, 100])

        # On a subset of columns
        gb = dat.groupby('order_id')
        dsr = gb['shares'].sum()
        self.assertEqual(dsr.shape, (2, 2))
        self.assertArrayEqual(dsr.shares, [800, 1100])

        # Accumulating function
        dsr = dat.groupby('order_id').cumsum()
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])

        # return_all has no effect with accumulating functions
        # 8/23/2018 SJK - changed behavior so return all shows the keys
        dsr = dat.groupby('order_id', return_all=True).cumsum()
        self.assertEqual(dsr.shape, (12, 3))

        # Add cum_shares back to a dataset
        dat['cum_shares'] = dat.groupby('order_id').shares.cumsum().shares
        self.assertEqual(dat.shape, (12, 5))
        self.assertArrayEqual(dat.cum_shares, gb.shares.cumsum().shares)

        # On a subset of columns
        dsr = dat.groupby('order_id')[['shares', 'second']].cumsum()
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])
        self.assertArrayEqual(
            dsr.second,
            [50, 120, 72, 195, 162, 250, 290, 347, 445, 560, 676, 410])

        # On a subset of columns with a filter
        f = FastArray([
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
        ])
        dsr = dat.groupby('order_id')[['shares', 'second']].cumsum(filter=f)
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares, [0, 0, 0, 0, 100, 100, 100, 100, 400, 400, 550, 100])
        self.assertArrayEqual(
            dsr.second,
            [50, 50, 72, 50, 162, 162, 145, 162, 260, 260, 376, 145])

        # On shares and second with filter at groupby construction
        dsr = dat.groupby('order_id', filter=f)[['shares', 'second']].cumsum()
        inv = INVALID_DICT[dsr.shares[0].dtype.num]
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, inv, 0, inv, 100, inv, 100, inv, 400, inv, 550, inv])
        self.assertArrayEqual(
            dsr.second,
            [50, inv, 72, inv, 162, inv, 145, inv, 260, inv, 376, inv])

        # Using agg function
        dsr = gb[['second', 'shares']].agg(['sum', 'mean'])
        self.assertEqual(dsr.shape, (2, 2))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Sum.shares, [800, 1100])
        self.assertArrayAlmostEqual(dsr.Mean.second, [82.00, 96.57], places=2)
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Check for issue when bracket indexing on groupby
        with open(os.devnull, 'w') as f:
            print(gb, file=f)
        dsr = gb[['second', 'shares']].agg(['sum', 'mean'])

        # Using different functions on different columns
        dsr = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Using numpy functions
        dsr = gb.agg({'second': np.sum, 'shares': [np.max, np.mean]})
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Alternate way to add to multiset
        gb = dat.groupby('order_id')
        ms = gb[['shares']].agg(['max', 'mean'])
        ms.Sum = gb[['second']].sum()
        self.assertEqual(ms.shape, (2, 3))
        self.assertArrayEqual(ms.Sum.second, [410, 676])
        self.assertArrayEqual(ms.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(ms.Mean.shares, [160.00, 157.14], places=2)
Example #29
    def test_projections(self):
        num_rows_trade = 1_000_000
        num_symbols = 450
        Trade_Dates = [
            '20180602', '20180603', '20180604', '20180605', '20180606'
        ]
        Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
        np.random.seed(1234)
        ds = Dataset({
            'SymbolID': np.random.randint(0, num_symbols, size=num_rows_trade),
            'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_trade)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
                for i in range(num_rows_trade)
            ],
            'Time': [
                int(i % (num_rows_trade / len(Trade_Dates)))
                for i in range(num_rows_trade)
            ],
            'Price': 100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
            'Size': 10 * np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
        })
        num_rows_quote = 1_000_000
        ds2 = Dataset({
            'SymbolID': np.random.randint(0, num_symbols, size=num_rows_quote),
            'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_quote)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
                for i in range(num_rows_quote)
            ],
            'Time': [
                int(i % (num_rows_quote / len(Trade_Dates)))
                for i in range(num_rows_quote)
            ],
            'Bid': 100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
            'Ask': 100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        })
        threshold = Dataset({'Is_Below_Threshold': np.random.rand(num_rows_quote) < 0.75})
        trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
        trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

        # Create GroupBy and corresponding Categorical
        trade_gb = trades.groupby(
            ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
        trade_cat = Categorical(
            [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

        # Call sum() and count()
        self.assertEqual(trade_gb.sum().shape, (455654, 7))
        self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
        self.assertEqual(trade_gb.count().shape, (455654, 5))
        # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
        self.assertEqual(trade_cat.count().shape, (455654, 5))
        b1 = trade_gb.count().Count.mean()
        b1c = trade_cat.count().Count.mean()
        b2 = trade_gb.count().shape[0]
        self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
        self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

        # Create ds augmented with filtered ID
        trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
        trade_ds_below_threshold = ds * threshold.Is_Below_Threshold
        trade_ds_below_thresholdb = Dataset.concat_columns(
            [trade_ds_below_threshold, trade_ds], do_copy=False)

        # Create trade_ds size projection using GroupBy
        trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
        trade_sizes_ds = trade_gb_id['Size'].sum()
        trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
        self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

        # Create trade_ds size projection using Categorical
        trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
        trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

        # Create trade_ds size projection using Pandas groupby
        ptrade_ds_below_thresholdb = dataset_as_pandas_df(
            trade_ds_below_thresholdb)
        ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
        trade_sizes_pd_ds = ptrade_gb_id.sum()
        trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
Example #30
    np.int16,
    np.uint16,
    np.int32,
    np.uint32,
    np.int64,
    np.uint64,
    np.float32,
    np.float64,
]
arr_types_string = [np.bytes_, np.str_]
test_data = {'bool': np.array([True, False, True, False, True], dtype=np.bool_)}
for dt in arr_types + arr_types_string:
    test_data[dt.__name__] = np.array(num_list, dtype=dt)
test_data['categorical'] = Categorical([str(i) for i in num_list])
all_headers = list(test_data.keys())
ds = Dataset(test_data)
gb_funcs = ['sum', 'mean', 'first', 'last', 'median', 'min', 'max', 'var']
gb_nan_funcs = ['nansum', 'nanmean', 'nanmedian', 'nanvar']  # 'rolling', 'cumsum', 'nth'


class Groupby_Test(unittest.TestCase):
    def test_math_ops_same_return(self):
        result_dict = {
            'sum': [5, 10],
            'nansum': [5, 10],
            'median': [2.5, 3],
            # TODO: add support for min / max on strings
            'min': [1, 2],
            'max': [4, 5],
        }