def dfs_arrow(tmp_path_factory):
    """Return a string DataFrame that has been exported to HDF5 and re-opened.

    The frame has two arrow string columns: ``s`` built from ``string_list``
    and ``sr`` built from ``string_list_reverse``.

    :param tmp_path_factory: factory whose ``mktemp`` provides the directory
        the HDF5 file is written to.
    :return: the result of ``vaex.open`` on the exported file.
    """
    export_target = str(tmp_path_factory.mktemp("vaex") / 'strings.hdf5')
    forward = vaex.string_column(string_list)
    backward = vaex.string_column(string_list_reverse)
    # Round-trip through a file so that the memory behind the columns is read only.
    vaex.from_arrays(s=forward, sr=backward).export(export_target)
    return vaex.open(export_target)
def create_base_ds():
    """Build the base ``DatasetArrays`` fixture with 21-row columns.

    Columns added, in order: ``x``, ``y``, ``m``, ``n``, ``nm``, ``mi``,
    ``ints``, ``name``, ``name_arrow`` and ``obj``.

    :return: the populated ``vaex.dataset.DatasetArrays`` instance.
    """
    dataset = vaex.dataset.DatasetArrays("dataset")

    def strided_column():
        # First column of the transposed big-endian float grid (21 values, -2..18).
        return np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]

    ma_value = 77777
    nan_rows = [-2 + 10, -2 + 20]
    masked_rows = [-1 + 10, -1 + 20]

    x = strided_column()
    y = x**2

    # Very large magnitudes at the start of each block of ten rows.
    ints = np.arange(-2, 19, dtype="i8")
    for base in (0, 10):
        ints[base] = 2**62 + 1
        ints[base + 1] = -2**62 + 1
        ints[base + 2] = -2**62 - 1

    # m is built from a fresh array (not x.copy()) and masked where it holds the sentinel.
    m_raw = strided_column()
    m_raw[masked_rows] = ma_value
    m = np.ma.array(m_raw, mask=m_raw == ma_value)

    # n carries NaNs only.
    n = x.copy()
    n[nan_rows] = np.nan

    # nm carries both NaNs and masked entries.
    nm_raw = x.copy()
    nm_raw[nan_rows] = np.nan
    nm_raw[masked_rows] = ma_value
    nm = np.ma.array(nm_raw, mask=nm_raw == ma_value)

    # Integer version of m with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64), mask=m.data == ma_value, fill_value=88888)

    labels = [str(value) + "bla" + ('_' * int(value)) for value in x]
    name = np.array(labels, dtype='U')

    obj_data = np.array(
        [
            'train',
            'false',
            True,
            1,
            30.,
            np.nan,
            'something',
            'something a bit longer resembling a sentence?!',
            -10000,
            'this should be masked',
        ],
        dtype='object',
    )
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')

    # Register everything in the same order as before.
    for column_name, values in (
        ("x", x),
        ("y", y),
        ("m", m),
        ('n', n),
        ('nm', nm),
        ("mi", mi),
        ("ints", ints),
        ("name", np.array(name)),
        ("name_arrow", vaex.string_column(name)),
        ("obj", obj),
    ):
        dataset.add_column(column_name, values)
    return dataset


# dsf = create_filtered()
def test_arrow_strings():
    """Exercise slicing, ``take`` and ``filter`` on an arrow string column."""
    expected_length = 4
    words = ['a', 'bb', 'ccc', 'dddd']
    column = vaex.string_column(words)
    df = vaex.from_arrays(x=column)

    # Slicing the stored column, and slicing that slice again.
    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4]
    assert tail[1:2].tolist() == words[3:4]

    # Slicing the DataFrame itself.
    assert len(df) == expected_length
    assert len(df[1:3]) == 2
    assert df[1:3].x.tolist() == words[1:3]

    # take() with plain indices, then with a masked index array.
    order = np.array([0, 2, 1, 3])
    assert column.take(order).tolist() == ['a', 'ccc', 'bb', 'dddd']
    masked_order = np.ma.array(order, mask=[False, True, False, False])
    assert column.take(masked_order).tolist() == ['a', None, 'bb', 'dddd']

    # The plain take is repeated after the masked one.
    order = np.array([0, 2, 1, 3])
    assert column.take(order).tolist() == ['a', 'ccc', 'bb', 'dddd']

    # filter() with a boolean mask, then with a masked boolean mask.
    keep = np.array([True, True, False, True])
    assert vaex.array_types.filter(column, keep).tolist() == ['a', 'bb', 'dddd']
    masked_keep = np.ma.array(
        np.array([True, True, False, True]),
        mask=[False, True, True, False],
    )
    assert vaex.array_types.filter(column, masked_keep).tolist() == ['a', 'dddd']
def test_arrow_strings_null():
    """Check that ``None`` entries survive ``tolist`` and slicing of a string column."""
    values = ['a', 'bb', None, 'dddd', None]
    column = vaex.string_column(values)
    assert column.tolist() == values
    for window in (slice(1, None), slice(2, 4)):
        assert column[window].tolist() == values[window]
def test_unique_arrow(df_factory):
    """Check ``unique`` (with and without ``return_inverse``) on an arrow string column.

    :param df_factory: callable that builds the DataFrame under test from
        keyword column arguments.
    """
    letters = ['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a']
    ds = df_factory(x=vaex.string_column(letters))
    with small_buffer(ds, 2):
        found = set(ds.unique(ds.x))
        assert found == {'a', 'b'}
        # The inverse index must reconstruct the original column.
        values, index = ds.unique(ds.x, return_inverse=True)
        rebuilt = np.array(values)[index].tolist()
        assert rebuilt == ds.x.tolist()
def test_string_strip_special_case():
    """Run ``str.strip(' ')`` on one long multi-sentence string.

    There is no assertion; the test only requires that evaluating the
    stripped column does not raise.
    """
    text = (
        "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? "
        "They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. "
        "And please don't remove the template from the talk page since I'm retired now.89.205.38.27"
    )
    df = vaex.from_arrays(s=vaex.string_column([text]))
    stripped = df.s.str.strip(' ')
    # Accessing .values forces the evaluation.
    stripped.values
def test_arrow_strings():
    """Check lengths and slices of an arrow string column and its DataFrame."""
    expected_length = 4
    words = ['a', 'bb', 'ccc', 'dddd']
    df = vaex.from_arrays(x=vaex.string_column(words))

    # Slicing the stored column.
    assert len(df.columns['x']) == 4
    tail = df.columns['x'][2:4]
    assert tail[:].tolist() == words[2:4]

    # Slicing the DataFrame itself.
    assert len(df) == expected_length
    assert len(df[1:3]) == 2
    assert df[1:3].x.tolist() == words[1:3]
def test_is_na():
    """Check ``isna`` on float, masked float, arrow string and object columns."""
    strings = vaex.string_column(["aap", None, "noot", "mies"])
    objects = ["aap", None, False, np.nan]
    floats = np.arange(4, dtype=np.float64)
    floats[2:4] = np.nan
    masked = np.ma.array(floats, mask=[0, 1, 0, 1])
    df = vaex.from_arrays(x=floats, m=masked, s=strings, o=objects)

    # Expected isna() result per column, checked in the same order as before.
    expectations = (
        ('x', [False, False, True, True]),
        ('m', [False, True, True, True]),
        ('s', [False, True, False, False]),
        ('o', [False, True, False, True]),
    )
    for column_name, flags in expectations:
        assert getattr(df, column_name).isna().tolist() == flags
def test_string_count_stat():
    """Check ``count`` on string data given as list, masked, object and arrow columns."""
    # Plain list of strings: every row counts.
    ds = vaex.from_arrays(names=['hello', 'this', 'is', 'long'])
    assert ds.count(ds.names) == 4

    # Masked array: the masked row is expected not to count.
    ds = vaex.from_arrays(names=np.ma.array(['hello', 'this', 'is', 'long'], mask=[0, 0, 1, 0]))
    assert ds.count(ds.names) == 3

    # Object array with a NaN entry. Use df's own expression here; the
    # previous code passed ds.names, which belongs to the other DataFrame.
    df = vaex.from_arrays(names=np.array(['hi', 'is', 'l2', np.nan], dtype='O'))
    assert df.count(df.names) == 3

    # Arrow string column with a null, counted through a single bin.
    names = vaex.string_column(['hello', 'this', None, 'long'])
    x = np.arange(len(names))
    df = vaex.from_arrays(names=names, x=x)
    assert df.count(df.names, binby='x', limits=[0, 100], shape=1).tolist() == [3]
def test_concat_mixed_types():
    """Check that concatenating a NaN float column with a string column yields strings."""
    nan_frame = vaex.from_arrays(x=np.full(3, np.nan))
    text_frame = vaex.from_arrays(x=vaex.string_column(['hi', 'there']))
    df = vaex.concat([nan_frame, text_frame])
    assert text_frame.x.dtype == df.x.dtype, "expect 'upcast' to string"

    # Windows inside the float part, across the boundary, and in the string part.
    cases = (
        (slice(None, 2), ['nan', 'nan']),
        (slice(1, 4), ['nan', 'nan', 'hi']),
        (slice(2, 4), ['nan', 'hi']),
        (slice(3, 4), ['hi']),
        (slice(3, 5), ['hi', 'there']),
    )
    for window, expected in cases:
        assert df[window].x.tolist() == expected
def test_unique_categorical(df_factory, future):
    """Check dtype, ``unique`` and ``nunique`` of an ordinal-encoded string column.

    :param df_factory: callable that builds the DataFrame under test from
        keyword column arguments.
    :param future: when truthy the frame is converted with ``_future()`` and
        string values are expected; otherwise integer codes are expected.
    """
    encoded = df_factory(x=vaex.string_column(['a', 'c', 'b', 'a', 'a'])).ordinal_encode('x')
    if future:
        encoded = encoded._future()
        expected_dtype, expected_values = str, {'a', 'b', 'c'}
    else:
        expected_dtype, expected_values = int, {0, 1, 2}

    assert encoded.x.dtype == expected_dtype
    assert set(encoded.x.unique()) == expected_values
    assert encoded.x.nunique() == 3
def test_null_values():
    """Check that a null string is excluded from ``count`` on that column."""
    strings = vaex.string_column(['aap', None, 'mies'])
    df = vaex.from_arrays(s=strings, x=[0, 1, 2])

    total_rows = df.count()
    assert total_rows == 3

    non_null = df.count(df.s)
    assert non_null == 2

    # With the selection x > 0, only one non-null string is left.
    non_null_selected = df.count(df.s, selection=df.x > 0)
    assert non_null_selected == 1
def dfs_arrow():
    """Return an in-memory DataFrame with the arrow string columns ``s`` and ``sr``.

    ``s`` is built from ``string_list`` and ``sr`` from ``string_list_reverse``.
    """
    columns = {
        's': vaex.string_column(string_list),
        'sr': vaex.string_column(string_list_reverse),
    }
    return vaex.from_arrays(**columns)
def create_base_ds():
    """Build the base DataFrame fixture with 21-row columns.

    Columns, in insertion order: ``x``, ``y``, ``m``, ``n``, ``nm``, ``mi``,
    ``ints``, ``name``, ``name_arrow``, ``obj``, ``bool``, ``datetime``,
    ``timedelta`` and ``123456``. A virtual column ``z`` (``x+t*y``) and the
    variable ``t`` (1.0) are added afterwards.

    :return: the result of ``_readonly()`` on the assembled DataFrame.
    """
    # First column of the transposed big-endian float grid: 21 values, -2..18.
    x = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    y = x**2

    # Very large magnitudes at the start of each block of ten rows.
    ints = np.arange(-2, 19, dtype="i8")
    ints[0] = 2**62 + 1
    ints[1] = -2**62 + 1
    ints[2] = -2**62 - 1
    ints[0 + 10] = 2**62 + 1
    ints[1 + 10] = -2**62 + 1
    ints[2 + 10] = -2**62 - 1

    columns = {"x": x, "y": y}

    # m is built from a fresh array (not x.copy()) and masked where it holds the sentinel.
    m = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    ma_value = 77777
    m[-1 + 10 + 2] = ma_value
    m[-1 + 20] = ma_value
    m = np.ma.array(m, mask=m == ma_value)

    # n carries NaNs only.
    n = x.copy()
    n[-2 + 10] = np.nan
    n[-2 + 20] = np.nan

    # nm carries both NaNs and masked entries.
    nm = x.copy()
    nm[-2 + 10] = np.nan
    nm[-2 + 20] = np.nan
    nm[-1 + 10] = ma_value
    nm[-1 + 20] = ma_value
    nm = np.ma.array(nm, mask=nm == ma_value)

    # Integer version of m with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64), mask=m.data == ma_value, fill_value=88888)
    columns["m"] = m
    columns['n'] = n
    columns['nm'] = nm
    columns["mi"] = mi
    columns["ints"] = ints

    # The same strings stored as a numpy unicode array and as an arrow string column.
    name = np.array([str(value) + "bla" + ('_' * int(value)) for value in x], dtype='U')
    columns["name"] = np.array(name)
    columns["name_arrow"] = vaex.string_column(name)

    # Object column: mixed Python values written over rows 2..11; the last one is masked.
    obj_data = np.array([
        'train', 'false', True, 1, 30., np.nan, 'something',
        'something a bit longer resembling a sentence?!', -10000,
        'this should be masked',
    ], dtype='object')
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')
    columns["obj"] = obj

    # Builtin bool rather than the np.bool alias, which NumPy 1.24 removed.
    booleans = np.ones(21, dtype=bool)
    booleans[[4, 6, 8, 14, 16, 19]] = False
    columns["bool"] = booleans

    datetimes = np.array([
        '2016-02-29T22:02:02.32', '2013-01-17T01:02:03.32', '2017-11-11T08:15:15.00',
        '1995-04-01T05:55:55.55', '2000-01-01T00:00:00.00', '2019-03-05T09:12:13.51',
        '1993-10-15T17:23:47.00', '2001-09-15T00:00:00.15', '2019-02-18T13:12:10.09',
        '1991-07-12T16:17:33.11', '2005-05-05T05:05:05.05', '2011-08-27T03:06:15.00',
        '1999-07-09T09:01:33.21', '2018-04-04T17:30:00.00', '2012-12-01T21:00:00.01',
        '1994-05-02T11:22:33.00', '2003-07-02T22:33:00.00', '2014-06-03T06:30:00.00',
        '1997-09-04T20:31:00.11', '2004-02-24T04:00:00.00', '2000-06-15T12:30:30.00',
    ], dtype=np.datetime64)
    timedelta = datetimes - np.datetime64('1996-05-17T16:45:00.00')
    columns["datetime"] = datetimes
    columns["timedelta"] = timedelta
    columns["123456"] = x  # a column that will have an alias

    df = vaex.from_arrays(**columns)
    df.add_virtual_column("z", "x+t*y")
    df.set_variable("t", 1.)
    return df._readonly()
def create_base_ds():
    """Build the base ``DatasetArrays`` fixture with 21-row columns.

    Columns added, in order: ``x``, ``y``, ``m``, ``n``, ``nm``, ``mi``,
    ``ints``, ``name``, ``name_arrow``, ``obj``, ``bool``, ``datetime`` and
    ``timedelta``.

    :return: the result of ``_readonly()`` on the populated dataset.
    """
    dataset = vaex.dataset.DatasetArrays("dataset")

    # First column of the transposed big-endian float grid: 21 values, -2..18.
    x = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    y = x**2

    # Very large magnitudes at the start of each block of ten rows.
    ints = np.arange(-2, 19, dtype="i8")
    ints[0] = 2**62 + 1
    ints[1] = -2**62 + 1
    ints[2] = -2**62 - 1
    ints[0 + 10] = 2**62 + 1
    ints[1 + 10] = -2**62 + 1
    ints[2 + 10] = -2**62 - 1

    dataset.add_column("x", x)
    dataset.add_column("y", y)

    # m is built from a fresh array (not x.copy()) and masked where it holds the sentinel.
    m = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    ma_value = 77777
    m[-1 + 10] = ma_value
    m[-1 + 20] = ma_value
    m = np.ma.array(m, mask=m == ma_value)

    # n carries NaNs only.
    n = x.copy()
    n[-2 + 10] = np.nan
    n[-2 + 20] = np.nan

    # nm carries both NaNs and masked entries.
    nm = x.copy()
    nm[-2 + 10] = np.nan
    nm[-2 + 20] = np.nan
    nm[-1 + 10] = ma_value
    nm[-1 + 20] = ma_value
    nm = np.ma.array(nm, mask=nm == ma_value)

    # Integer version of m with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64), mask=m.data == ma_value, fill_value=88888)
    dataset.add_column("m", m)
    dataset.add_column('n', n)
    dataset.add_column('nm', nm)
    dataset.add_column("mi", mi)
    dataset.add_column("ints", ints)

    # The same strings stored as a numpy unicode array and as an arrow string column.
    name = np.array([str(value) + "bla" + ('_' * int(value)) for value in x], dtype='U')
    dataset.add_column("name", np.array(name))
    dataset.add_column("name_arrow", vaex.string_column(name))

    # Object column: mixed Python values written over rows 2..11; the last one is masked.
    obj_data = np.array([
        'train', 'false', True, 1, 30., np.nan, 'something',
        'something a bit longer resembling a sentence?!', -10000,
        'this should be masked',
    ], dtype='object')
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')
    dataset.add_column("obj", obj)

    # Builtin bool rather than the np.bool alias, which NumPy 1.24 removed.
    booleans = np.ones(21, dtype=bool)
    booleans[[4, 6, 8, 14, 16, 19]] = False
    dataset.add_column("bool", booleans)

    datetimes = np.array([
        '2016-02-29T22:02:02.32', '2013-01-17T01:02:03.32', '2017-11-11T08:15:15.00',
        '1995-04-01T05:55:55.55', '2000-01-01T00:00:00.00', '2019-03-05T09:12:13.51',
        '1993-10-15T17:23:47.00', '2001-09-15T00:00:00.15', '2019-02-18T13:12:10.09',
        '1991-07-12T16:17:33.11', '2005-05-05T05:05:05.05', '2011-08-27T03:06:15.00',
        '1999-07-09T09:01:33.21', '2018-04-04T17:30:00.00', '2012-12-01T21:00:00.01',
        '1994-05-02T11:22:33.00', '2003-07-02T22:33:00.00', '2014-06-03T06:30:00.00',
        '1997-09-04T20:31:00.11', '2004-02-24T04:00:00.00', '2000-06-15T12:30:30.00',
    ], dtype=np.datetime64)
    timedelta = datetimes - np.datetime64('1996-05-17T16:45:00.00')
    dataset.add_column("datetime", datetimes)
    dataset.add_column("timedelta", timedelta)
    return dataset._readonly()
def create_base_ds():
    """Build the base ``DatasetArrays`` fixture with 21-row columns.

    Columns added, in order: ``x``, ``y``, ``m``, ``n``, ``nm``, ``mi``,
    ``ints``, ``name``, ``name_arrow``, ``obj``, ``bool``, ``datetime`` and
    ``timedelta``.

    :return: the result of ``_readonly()`` on the populated dataset.
    """
    dataset = vaex.dataset.DatasetArrays("dataset")

    # First column of the transposed big-endian float grid: 21 values, -2..18.
    x = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    y = x ** 2

    # Very large magnitudes at the start of each block of ten rows.
    ints = np.arange(-2, 19, dtype="i8")
    ints[0] = 2**62 + 1
    ints[1] = -2**62 + 1
    ints[2] = -2**62 - 1
    ints[0 + 10] = 2**62 + 1
    ints[1 + 10] = -2**62 + 1
    ints[2 + 10] = -2**62 - 1

    dataset.add_column("x", x)
    dataset.add_column("y", y)

    # m is built from a fresh array (not x.copy()) and masked where it holds the sentinel.
    m = np.arange(-2, 40, dtype=">f8").reshape((-1, 21)).T.copy()[:, 0]
    ma_value = 77777
    m[-1 + 10] = ma_value
    m[-1 + 20] = ma_value
    m = np.ma.array(m, mask=m == ma_value)

    # n carries NaNs only.
    n = x.copy()
    n[-2 + 10] = np.nan
    n[-2 + 20] = np.nan

    # nm carries both NaNs and masked entries.
    nm = x.copy()
    nm[-2 + 10] = np.nan
    nm[-2 + 20] = np.nan
    nm[-1 + 10] = ma_value
    nm[-1 + 20] = ma_value
    nm = np.ma.array(nm, mask=nm == ma_value)

    # Integer version of m with an explicit fill value.
    mi = np.ma.array(m.data.astype(np.int64), mask=m.data == ma_value, fill_value=88888)
    dataset.add_column("m", m)
    dataset.add_column('n', n)
    dataset.add_column('nm', nm)
    dataset.add_column("mi", mi)
    dataset.add_column("ints", ints)

    # The same strings stored as a numpy unicode array and as an arrow string column.
    name = np.array([str(value) + "bla" + ('_' * int(value)) for value in x], dtype='U')
    dataset.add_column("name", np.array(name))
    dataset.add_column("name_arrow", vaex.string_column(name))

    # Object column: mixed Python values written over rows 2..11; the last one is masked.
    obj_data = np.array([
        'train', 'false', True, 1, 30., np.nan, 'something',
        'something a bit longer resembling a sentence?!', -10000,
        'this should be masked',
    ], dtype='object')
    obj_mask = np.array([False] * 9 + [True])
    obj = nm.copy().astype('object')
    obj[2:12] = np.ma.MaskedArray(data=obj_data, mask=obj_mask, dtype='object')
    dataset.add_column("obj", obj)

    # Builtin bool rather than the np.bool alias, which NumPy 1.24 removed.
    booleans = np.ones(21, dtype=bool)
    booleans[[4, 6, 8, 14, 16, 19]] = False
    dataset.add_column("bool", booleans)

    datetimes = np.array([
        '2016-02-29T22:02:02.32', '2013-01-17T01:02:03.32', '2017-11-11T08:15:15.00',
        '1995-04-01T05:55:55.55', '2000-01-01T00:00:00.00', '2019-03-05T09:12:13.51',
        '1993-10-15T17:23:47.00', '2001-09-15T00:00:00.15', '2019-02-18T13:12:10.09',
        '1991-07-12T16:17:33.11', '2005-05-05T05:05:05.05', '2011-08-27T03:06:15.00',
        '1999-07-09T09:01:33.21', '2018-04-04T17:30:00.00', '2012-12-01T21:00:00.01',
        '1994-05-02T11:22:33.00', '2003-07-02T22:33:00.00', '2014-06-03T06:30:00.00',
        '1997-09-04T20:31:00.11', '2004-02-24T04:00:00.00', '2000-06-15T12:30:30.00',
    ], dtype=np.datetime64)
    timedelta = datetimes - np.datetime64('1996-05-17T16:45:00.00')
    dataset.add_column("datetime", datetimes)
    dataset.add_column("timedelta", timedelta)
    return dataset._readonly()
def test_string_strip_special_case2():
    """Compare ``str.upper`` with ``str_pandas.upper`` on a string with non-ASCII characters.

    NOTE(review): the function name says "strip" but the body exercises
    ``upper()``.
    """
    text = 'The eunuch in question left me no choice but to reinsert it. Take action as you see fit.·snunɐw·'
    df = vaex.from_arrays(s=vaex.string_column([text]))
    native_result = df.s.str.upper().tolist()
    pandas_result = df.s.str_pandas.upper().tolist()
    assert native_result == pandas_result
def test_string_strip_special_case2():
    """Compare ``str.capitalize`` with ``str_pandas.capitalize`` on a non-ASCII leading character.

    NOTE(review): the function name says "strip" but the body exercises
    ``capitalize()``.
    """
    samples = ['ɐa', 'aap']
    df = vaex.from_arrays(s=vaex.string_column(samples))
    native_result = df.s.str.capitalize().tolist()
    pandas_result = df.s.str_pandas.capitalize().tolist()
    assert native_result == pandas_result
def test_concat_arrow_strings():
    """Check that concatenating two arrow string frames keeps the dtype and the row order."""
    first_words = ['aap', 'noot', 'mies']
    second_words = ['a', 'b', 'c']
    first = vaex.from_arrays(x=vaex.string_column(first_words))
    second = vaex.from_arrays(x=vaex.string_column(second_words))

    combined = vaex.concat([first, second])
    assert combined.data_type('x') == first.data_type('x')
    assert combined.x.tolist() == ['aap', 'noot', 'mies', 'a', 'b', 'c']