Пример #1
0
    def test_apply(self):
        """Smoke-test Accum2 reductions with every filter/showfilter combination.

        Builds a random dataset and runs ``sum``, ``median`` and
        ``apply_reduce(np.median, ...)`` through ``ds.accum2('symbol', 'data')``.
        Nothing is asserted about the values; the test passes as long as no
        call raises.
        """
        arrsize = 200
        numrows = 7

        ds = Dataset({'time': arange(arrsize * 1.0)})
        ds.data = np.random.randint(numrows, size=arrsize)
        ds.data2 = np.random.randint(numrows, size=arrsize)
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
        ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
        ds.accum2('symbol', 'data').sum(ds.data2)
        ds.accum2('symbol', 'data').sum(ds.data2, showfilter=True)
        ds.accum2('symbol', 'data').median(ds.data2, showfilter=True)
        ds.accum2('symbol', 'data').median(ds.data2, showfilter=False)
        ds.accum2('symbol', 'data').apply_reduce(np.median,
                                                 ds.data2,
                                                 showfilter=True)
        ds.accum2('symbol', 'data').apply_reduce(np.median,
                                                 ds.data2,
                                                 showfilter=False)
        # The mask is sized from arrsize (not a literal) so it always matches
        # the number of rows in the dataset.
        f = logical(arange(arrsize) % 2)
        ds.accum2('symbol', 'data').apply_reduce(np.median, ds.data2, filter=f)
        ds.accum2('symbol', 'data').apply_reduce(np.median,
                                                 ds.data2,
                                                 filter=f,
                                                 showfilter=True)
        ds.accum2('symbol', 'data').median(ds.data2, filter=f, showfilter=True)
Пример #2
0
    def test_simple_cats(self):
        """Sum through an unfiltered Accum2 built from two single-key categoricals.

        Each value is expected in the column named after its label, at the
        row position matching its index.
        """
        values = arange(1, 6) * 10
        labels = FastArray(['a', 'b', 'c', 'd', 'e'])
        col_cat = Categorical(labels)
        row_cat = Categorical(arange(5))

        # no filter
        summed = Accum2(row_cat, col_cat).sum(values)
        self.assertEqual(summed._ncols, 7)
        for pos, label in enumerate(labels):
            column = summed[label]
            self.assertEqual(column[pos], values[pos])
Пример #3
0
    def test_simple_cats_filter_accum(self):
        """Sum through an Accum2 that was constructed with ``showfilter=True``.

        Compared with the unfiltered case, the result has one more column and
        every value sits one row lower.
        """
        values = arange(1, 6) * 10
        labels = FastArray(['a', 'b', 'c', 'd', 'e'])
        col_cat = Categorical(labels)
        row_cat = Categorical(arange(5))

        # filtered accum object
        summed = Accum2(row_cat, col_cat, showfilter=True).sum(values)
        self.assertEqual(summed._ncols, 8)
        for pos, label in enumerate(labels):
            column = summed[label]
            # rows are shifted down by one relative to the input positions
            self.assertEqual(column[pos + 1], values[pos])
Пример #4
0
 def test_accum2_median(self):
     """Check the Accum2 median margins against medians computed directly.

     The 'Median' footer is compared per symbol and the first summary
     column is compared per ``data`` value.
     """
     ds = Dataset({'time': arange(200.0)})
     ds.data = np.random.randint(7, size=200)
     ds.data2 = np.random.randint(7, size=200)
     symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
     ds.symbol = Cat(1 + arange(200) % 5, symbols)

     medians = Accum2(ds.data, ds.symbol).median(ds.time)
     total_name = medians.summary_get_names()[0]
     totalcol = medians[total_name]
     footer = medians.footer_get_values()['Median']

     # footer entries are offset by one relative to the symbol positions
     for pos, sym in enumerate(symbols):
         expected = ds[ds.symbol == sym, :].time.median()
         self.assertEqual(footer[pos + 1], expected)

     for key in range(7):
         expected = ds[ds.data == key, :].time.median()
         self.assertEqual(totalcol[key], expected)
Пример #5
0
    def test_col_moves(self):
        """Exercise Struct.col_move_to_front / col_move_to_back / col_move.

        The Struct is mutated in place, so every assertion depends on the
        exact sequence of moves that precedes it.
        """
        # Sixteen columns named 'a'..'p', each holding a list of ten ints.
        st = Struct(
            {
                _k: list(range(_i * 10, (_i + 1) * 10))
                for _i, _k in enumerate('abcdefghijklmnop')
            }
        )

        # Move by integer position; repeating the same move restores the
        # original order (the next assertion starts from 'a'..'p' again).
        st.col_move_to_front(1)
        self.assertEqual(list(st), list('bacdefghijklmnop'))
        st.col_move_to_front(1)

        st.col_move_to_back(14)
        self.assertEqual(list(st), list('abcdefghijklmnpo'))
        st.col_move_to_back(14)

        # An integer array argument is rejected with ValueError.
        with self.assertRaises(ValueError):
            st.col_move_to_front(arange(20))

        # Move several columns at once, given as a list of names.
        st.col_move_to_back(list('dgh'))
        self.assertEqual(list(st), list('abcefijklmnopdgh'))
        st.col_move_to_front(list('gpha'))
        self.assertEqual(list(st), list('gphabcefijklmnod'))
        # col_move takes the front list and the back list in one call.
        st.col_move(list('cim'), list('hfo'))
        self.assertEqual(list(st), list('cimgpabejklndhfo'))
        # A dict and a single name are also accepted; the unknown name 'q'
        # only produces a UserWarning and leaves the order unchanged.
        st.col_move_to_front({'g': 1})
        st.col_move_to_front('h')
        with self.assertWarns(UserWarning):
            st.col_move_to_front('q')
        self.assertEqual(list(st), list('hgcimpabejklndfo'))
        st.col_move_to_back({'g': 1})
        st.col_move_to_back('h')
        with self.assertWarns(UserWarning):
            st.col_move_to_back('q')
        self.assertEqual(list(st), list('cimpabejklndfogh'))
Пример #6
0
    def test_qcut(self):
        """Check the bin codes produced by ``qcut`` and by a labelled ``cut``.

        Codes are read from ``c._np`` and compared with the expected codes by
        summing the element-wise difference.
        """
        c = qcut(arange(10), 3)
        self.assertTrue(sum(c._np - FA([2, 2, 2, 2, 3, 3, 3, 4, 4, 4])) == 0)

        c = qcut(arange(11), 3)
        self.assertTrue(
            sum(c._np - FA([2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4])) == 0)

        c = qcut(range(5), 3, labels=["good", "medium", "bad"])
        self.assertTrue(sum(c._np - FA([2, 2, 3, 4, 4])) == 0)

        c = cut(
            FA([2, 4, 6, 8, 10]),
            FA([0, 2, 4, 6, 8, 10]),
            labels=['a', 'b', 'c', 'd', 'e'],
        )
        # Previously this result was computed but never checked; assert the
        # same codes that the identical labelled-cut call elsewhere in this
        # file is checked against.
        self.assertTrue(sum(c._np - FA([1, 2, 3, 4, 5])) == 0)
Пример #7
0
    def test_multikey_cats_filter_accum_ordered(self):
        """Sum with ``showfilter=True`` over multikey categoricals.

        Runs once with default categoricals (values expected in input order)
        and once with ``ordered=True`` (values expected in the order given by
        sorting the string key).
        """
        unsorted_str = FastArray(['c', 'e', 'b', 'd', 'a'])
        ints = arange(1, 6) * 10
        data = np.random.rand(5) * 10

        def check(table, str_keys, int_keys, expected):
            # Column names look like "('c', 10)"; rows are shifted by one.
            self.assertEqual(table._ncols, 9)
            for pos, raw in enumerate(str_keys):
                colname = f"('{bytes.decode(raw)}', {int_keys[pos]!s})"
                column = table[colname]
                self.assertEqual(column[pos + 1], expected[pos])

        # unsorted filter accum object
        col_cat = Categorical([unsorted_str, ints])
        row_cat = Categorical([unsorted_str, ints])
        table = Accum2(row_cat, col_cat).sum(data, showfilter=True)
        check(table, unsorted_str, ints, data)

        # sorted filter accum object
        order = np.argsort(unsorted_str)
        col_cat = Categorical([unsorted_str, ints], ordered=True)
        row_cat = Categorical([unsorted_str, ints], ordered=True)
        table = Accum2(row_cat, col_cat).sum(data, showfilter=True)
        check(table, unsorted_str[order], ints[order], data[order])
Пример #8
0
    def test_ismember_categorical_numeric(self):
        """ismember between a numeric Categorical and a FastArray.

        For integer and float categoricals, every element except the last is
        expected to be found, with indices cycling 0, 1, 2; the last element
        (4) is expected to be missing with an invalid index. Comparing against
        a string array, or using a multikey categorical, must raise TypeError.
        """
        c = Categorical([1, 2, 3, 1, 2, 3, 1, 2, 4])
        f = FastArray([1, 2, 3], dtype=np.int64)
        b, idx = ismember(c, f)
        self.assertTrue(bool(np.all(b[:-1])))
        # assertFalse takes (expr, msg); the stray ``False`` msg was dropped.
        self.assertFalse(b[-1])
        self.assertTrue(bool(np.all(idx[:-1] == tile(FA([0, 1, 2]), 3)[:-1])))
        self.assertTrue(idx.isnan()[-1])
        f = FastArray(['a', 'b', 'c'])
        with pytest.raises(TypeError):
            b, idx = ismember(c, f)

        c = Categorical([1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 4.0])
        f = FastArray([1, 2, 3], dtype=np.float64)
        b, idx = ismember(c, f)
        self.assertTrue(bool(np.all(b[:-1])))
        self.assertFalse(b[-1])
        self.assertTrue(bool(np.all(idx[:-1] == tile(FA([0, 1, 2]), 3)[:-1])))
        self.assertTrue(idx.isnan()[-1])
        f = FastArray(['a', 'b', 'c'])
        with pytest.raises(TypeError):
            b, idx = ismember(c, f)

        # Multikey categorical; ``f`` is still the string array from above.
        c = Categorical([np.random.choice(['a', 'b', 'c'], 10), arange(10)])
        with pytest.raises(TypeError):
            b, idx = ismember(c, f)
Пример #9
0
    def test_col_ctor_02(self):
        """Construct a Struct whose keys are 'True', 'False' and 'None'.

        Construction is expected to emit a UserWarning while still keeping
        the keys, their order and their values.
        """
        arr = arange(5)
        inv_keys = ['True', 'False', 'None']
        with self.assertWarns(UserWarning):
            st = Struct(dict.fromkeys(inv_keys, arr))

        same_keys = np.all(inv_keys == list(st))
        self.assertTrue(bool(same_keys))
        for name in inv_keys:
            same_values = np.all(st[name] == arr)
            self.assertTrue(bool(same_values))
Пример #10
0
    def test_accum2(self):
        """Check the bin codes produced by ``cut``.

        Despite its name, this test only calls ``cut``: with a bin count, with
        explicit bin edges, and with explicit edges plus labels.
        """
        def assert_codes(binned, expected):
            # Codes match when the element-wise difference sums to zero.
            self.assertTrue(sum(binned._np - FA(expected)) == 0)

        assert_codes(cut(arange(10), 3), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3])
        assert_codes(cut(arange(10.0), 3), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3])
        assert_codes(cut(arange(11), 3), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3])

        edges_only = cut(FA([2, 4, 6, 8, 10]), FA([0, 2, 4, 6, 8, 10]))
        assert_codes(edges_only, [1, 2, 3, 4, 5])

        labelled = cut(
            FA([2, 4, 6, 8, 10]),
            FA([0, 2, 4, 6, 8, 10]),
            labels=['a', 'b', 'c', 'd', 'e'],
        )
        assert_codes(labelled, [1, 2, 3, 4, 5])
Пример #11
0
    def test_concat(self):
        """Concatenate two Structs holding a Dataset, an array and a Categorical.

        Verifies the container types of the result, that each item is the
        horizontal stack of the two inputs, and that key order is preserved.
        """
        def build():
            # Dataset of five random columns, an int array and a categorical.
            return Struct({
                'ds': TypeRegister.Dataset(
                    {'col_' + str(i): np.random.rand(5) for i in range(5)}),
                'arr': arange(5),
                'cat': TypeRegister.Categorical(['a', 'a', 'b', 'c', 'a']),
            })

        st1 = build()
        st2 = build()

        result = Struct.concat_structs([st1, st2])
        self.assertTrue(isinstance(result.ds, TypeRegister.Dataset))
        self.assertTrue(isinstance(result.arr, TypeRegister.FastArray))
        self.assertTrue(isinstance(result.cat, TypeRegister.Categorical))

        expected_arr = np.hstack([st1.arr, st2.arr])
        self.assertTrue(bool(np.all(expected_arr == result.arr)))

        expected_cat = np.array(
            ['a', 'a', 'b', 'c', 'a', 'a', 'a', 'b', 'c', 'a'])
        self.assertTrue(bool(np.all(expected_cat == result.cat)))

        for colname in st1.ds:
            stacked = np.hstack([st1.ds[colname], st2.ds[colname]])
            self.assertTrue(bool(np.all(stacked == result.ds[colname])))

        expected_keys = list(st1.keys())
        actual_keys = list(result.keys())
        self.assertTrue(bool(np.all(expected_keys == actual_keys)))
Пример #12
0
    def test_accum2_nanmedian_with_filter(self):
        """Check filtered Accum2.nanmedian margins and that inputs are untouched.

        The same Accum2 object is used with two different symbol filters. For
        each filter, the 'Median' footer is compared per chosen symbol and the
        first summary column per ``data`` value, against ``rt.nanmedian``
        computed directly on the filtered rows. After each call the symbol
        codes are checked to contain no zeros.
        """
        ds = Dataset({'time': arange(200.0)})
        ds.data = np.random.randint(7, size=200)
        ds.data2 = np.random.randint(7, size=200)
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
        # N.B. make a copy here for testing
        symbol_categorical = Cat(1 + arange(200) % 5, symbols)
        # N.B. Categorical.copy and Categorical constructor doesn't do deep copy?!
        ds.symbol = Cat(1 + arange(200) % 5, symbols)

        chosen_symbols = ['AMZN', 'AAPL']
        filt = symbol_categorical.isin(chosen_symbols)
        ac = Accum2(ds.data, ds.symbol)
        stat1 = ac.nanmedian(ds.time, filter=filt)
        totalcol = stat1[stat1.summary_get_names()[0]]
        footer = stat1.footer_get_values()['Median']
        # Make sure we don't change the input data
        self.assertTrue(not rt.any(ds.symbol._fa == 0))
        for sym in chosen_symbols:
            s_median = rt.nanmedian(ds[symbol_categorical == sym, :].time)
            i = rt.where(symbol_categorical.category_array == sym)[0].item()
            self.assertEqual(footer[i + 1], s_median)
        for i in range(7):
            s_median = rt.nanmedian(ds[(ds.data == i) & filt, :].time)
            self.assertEqual(totalcol[i], s_median)

        chosen_symbols = ['IBM', 'FB']
        filt = symbol_categorical.isin(chosen_symbols)
        stat2 = ac.nanmedian(ds.time, filter=filt)
        totalcol = stat2[stat2.summary_get_names()[0]]
        footer = stat2.footer_get_values()['Median']
        # Make sure we don't change the input data
        self.assertTrue(not rt.any(ds.symbol._fa == 0))
        # The checks below run inside their loops, as in the first half, so
        # every chosen symbol and every data value is verified rather than
        # only the last one.
        for sym in chosen_symbols:
            s_median = rt.nanmedian(ds[symbol_categorical == sym, :].time)
            i = rt.where(symbol_categorical.category_array == sym)[0].item()
            self.assertEqual(footer[i + 1], s_median)
        for i in range(7):
            s_median = rt.nanmedian(ds[(ds.data == i) & filt, :].time)
            self.assertEqual(totalcol[i], s_median)
Пример #13
0
    def test_apply_nonreduce(self):
        """Exercise apply_reduce / apply_nonreduce on a Categorical and an Accum2.

        Only the final step is asserted: an element-wise maximum applied
        through ``apply_nonreduce`` must equal ``np.maximum`` applied directly.
        The earlier calls are run for their side effect of not raising.
        """
        arrsize = 200
        numrows = 7
        ds = rt.Dataset({'time': rt.arange(arrsize * 1.0)})
        ds.data = arange(arrsize) % numrows
        ds.data2 = (arange(arrsize) + 3) % numrows
        # Five tickers followed by the strings '6' through '18'.
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
        symbols += [str(num) for num in range(6, 19)]
        ds.symbol = rt.Cat(1 + rt.arange(arrsize) % len(symbols), symbols)

        def sum_of_min(left, right):
            return np.sum(np.minimum(left, right))

        def sum_of_max(left, right):
            return np.sum(np.maximum(left, right))

        # Result intentionally unused: the call itself is what is exercised.
        ds.symbol.apply_reduce(sum_of_min, (ds.data, ds.data))

        ac = ds.accum2('symbol', 'data')
        newds = ac.apply_nonreduce(np.cumsum)
        # Result intentionally unused as well.
        ac.apply_reduce(sum_of_max, (newds.data, newds.data2))

        direct = np.maximum(newds.data, newds.data2)
        grouped = ac.apply_nonreduce(
            lambda left, right: np.maximum(left, right),
            (newds.data, newds.data2))[0]
        self.assertTrue(np.all(direct == grouped))
Пример #14
0
    def test_showfilter_label_subclass(self):
        """``YLabel`` keeps the key's array subclass when ``showfilter=True``.

        For Date, DateTimeNano, DateSpan and TimeSpan keys, the result's
        ``YLabel`` must be an instance of the same class and its first
        element must be NaN/invalid.
        """
        def check_ylabel(keys, expected_type):
            row_cat = Categorical(keys)
            col_cat = Categorical(arange(10))
            result = Accum2(row_cat, col_cat).count(showfilter=True)
            self.assertTrue(isinstance(result.YLabel, expected_type))
            self.assertTrue(result.YLabel.isnan()[0])

        check_ylabel(Date.range('20190201', '20190210'), Date)
        check_ylabel(DateTimeNano.random(10), DateTimeNano)
        check_ylabel(DateSpan(arange(10, 20)), DateSpan)
        check_ylabel(TimeSpan(np.random.rand(10) * 10_000_000_000), TimeSpan)
Пример #15
0
def test_ismember_categorical():
    """ismember on Categoricals must agree with ismember on their string arrays.

    Both directions (c in d, d in c) are compared. Passing an integer array
    as the second argument to a string Categorical must raise TypeError.
    """
    def check_both_directions(left, right):
        left_str, right_str = left.as_string_array, right.as_string_array
        pairs = (
            (left, right, left_str, right_str),
            (right, left, right_str, left_str),
        )
        for cat_a, cat_b, str_a, str_b in pairs:
            b, f = ismember(cat_a, cat_b)
            bs, fs = ismember(str_a, str_b)
            assert_array_equal(b, bs)
            assert_array_equal(int8(f), fs)

    for b_index_c in [0, 1]:
        for b_index_d in [0, 1]:
            # string values, both base indices
            c = TypeRegister.Categorical(
                np.random.choice(['a', 'b', 'c', 'd', 'e', 'f'], 15),
                base_index=b_index_c,
            )
            d = TypeRegister.Categorical(
                np.random.choice(['a', 'b', 'c'], 10),
                base_index=b_index_d,
            )
            check_both_directions(c, d)

        # codes, string values: runs once per outer iteration, and ``d`` is
        # always built with explicit categories and base_index=1
        c = TypeRegister.Categorical(
            np.random.choice(['a', 'b', 'c', 'd', 'e', 'f'], 15),
            base_index=b_index_c,
        )
        d = TypeRegister.Categorical(
            np.random.choice(['a', 'b', 'c'], 10),
            ['a', 'b', 'c'],
            base_index=1,
        )
        check_both_directions(c, d)

    c = Categorical(np.random.choice(['a', 'b', 'c'], 15))
    with pytest.raises(TypeError):
        ismember(c, arange(3))
Пример #16
0
def test_ismember_align_multikey():
    """Multikey ismember across mixed string/numeric key representations.

    Compatible combinations must all produce the same boolean mask and index
    array; incompatible combinations must raise TypeError.
    """
    correct_bool = FastArray([True, True, True, False, False])
    correct_idx = FastArray([0, 1, 2, int8.inv, int8.inv], dtype=np.int8)

    def unicode_array(values, dtype):
        return FastArray(values, dtype=dtype, unicode=True)

    def five_unicode():
        return unicode_array(['a', 'b', 'c', 'd', 'e'], 'U5')

    def three_unicode():
        return unicode_array(['a', 'b', 'c'], 'U4')

    def three_unicode_cat():
        return Categorical(three_unicode(), unicode=True)

    def assert_aligned(left_keys, right_keys):
        b, idx = ismember(left_keys, right_keys)
        assert_array_equal(b, correct_bool)
        # NOTE: flip to numpy because FastArray is sentinel-aware
        assert_array_equal(idx._np, correct_idx._np)

    # bytes / unicode both upcast
    a_keys = [arange(5), FastArray([b'a', b'b', b'c', b'd', b'e'], dtype='S5')]
    b_keys = [arange(3), three_unicode()]
    assert_aligned(a_keys, b_keys)
    # the caller's byte-string key must still be a byte-string array afterwards
    assert a_keys[1].dtype.char == 'S'

    # bytes / Categorical unicode
    a_keys = [arange(5), FastArray(['a', 'b', 'c', 'd', 'e'], dtype='S5')]
    b_keys = [arange(3), three_unicode_cat()]
    assert_aligned(a_keys, b_keys)

    # unicode / Categorical
    a_keys = [arange(5), five_unicode()]
    b_keys = [arange(3), three_unicode_cat()]
    assert_aligned(a_keys, b_keys)

    # different numeric types
    a_keys = [arange(5, dtype=np.float64), five_unicode()]
    b_keys = [arange(3), three_unicode_cat()]
    assert_aligned(a_keys, b_keys)

    # string / non-string
    a_keys = [arange(5).astype('S'), five_unicode()]
    b_keys = [arange(3), three_unicode_cat()]
    with pytest.raises(TypeError):
        ismember(a_keys, b_keys)

    # multikey categorical, no expand array
    a_keys = [arange(5).astype('S'), five_unicode()]
    b_keys = [
        arange(3),
        Categorical([three_unicode(), arange(3)], unicode=True),
    ]
    with pytest.raises(TypeError):
        ismember(a_keys, b_keys)
    with pytest.raises(TypeError):
        ismember(b_keys, a_keys)

    # unsupported object array
    a_keys = [
        arange(5).astype('O'),
        FastArray(['a', 'b', 'c', 'd', 'e'], dtype='S5'),
    ]
    b_keys = [arange(3), three_unicode()]
    with pytest.raises(TypeError):
        ismember(a_keys, b_keys)
Пример #17
0
def test_ismember_int_edges():
    """Run ismember over integer array sizes around 128 and 256.

    Nothing is asserted about the values; the test only requires that each
    call returns a pair without raising.
    """
    # hit thresholds for a previous bug
    for a_size in (127, 129, 254, 256):
        haystack = arange(a_size)
        for b_size in range(129):
            _found, _index = ismember(haystack, arange(b_size))
Пример #18
0
 def test_tree(self):
     """Sanity check: ``.tree()`` returns a DisplayString, even for an empty Struct."""
     container = Struct()
     empty_tree = container.tree()
     self.assertIsInstance(empty_tree, DisplayString)

     container['foo'] = Dataset({'bar': arange(5)})
     populated_tree = container.tree()
     self.assertIsInstance(populated_tree, DisplayString)
Пример #19
0
 def test_dataset_accum2(self):
     """Filtered sum through ``Dataset.accum2``, checked via the summary column."""
     # test from accum2 off dataset and with a filter
     ds = Dataset({'test': arange(10), 'data': arange(10) // 2})
     only_threes = ds.data == 3
     summed = ds.accum2('data', 'test').sum(ds.test, filter=only_threes)
     total_name = summed.summary_get_names()[0]
     # rows with data == 3 hold test values 6 and 7
     self.assertEqual(summed[total_name][3], 13)