def test_upload_pandas_categorical_ipc(self, con): con.execute("DROP TABLE IF EXISTS test_categorical;") df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) df["B"] = df["A"].astype('category') # test that table created correctly when it doesn't exist on server con.load_table("test_categorical", df) ans = con.execute("select * from test_categorical").fetchall() assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')] assert con.get_table_details("test_categorical") == [ ColumnDetails( name='A', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT', is_array=False, ), ColumnDetails( name='B', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT', is_array=False, ), ] # load row-wise con.load_table("test_categorical", df, method="rows") # load columnar con.load_table("test_categorical", df, method="columnar") # load arrow con.load_table("test_categorical", df, method="arrow") # test end result df_ipc = con.select_ipc("select * from test_categorical") assert df_ipc.shape == (16, 2) res = df.append([df, df, df]).reset_index(drop=True) res["A"] = res["A"].astype('category') res["B"] = res["B"].astype('category') assert pd.DataFrame.equals(df_ipc, res) # test that input df wasn't mutated # original input is object, categorical # to load via Arrow, converted internally to object, object assert is_object_dtype(df["A"]) assert is_categorical_dtype(df["B"]) con.execute("DROP TABLE IF EXISTS test_categorical;")
def test_get_table_details(self, con): c = con.cursor() c.execute('drop table if exists stocks;') create = ('create table stocks (date_ text, trans text, symbol text, ' 'qty int, price float, vol float);') c.execute(create) i1 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14,1.1);" # noqa i2 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','GOOG',100,12.14,1.2);" # noqa c.execute(i1) c.execute(i2) result = con.get_table_details('stocks') expected = [ ColumnDetails(name='date_', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='trans', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='symbol', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='qty', type='INT', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE'), ColumnDetails(name='price', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE'), ColumnDetails(name='vol', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE') ] assert result == expected c.execute('drop table if exists stocks;')
def get_col_types(col_properties: dict): common_col_params = dict( nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE', # is_array=True, ) return [ ColumnDetails(**properties, **common_col_params) for properties in col_properties ]
def test_extract_row_details(self): data = [ TColumnType(col_name='date_', col_type=TTypeInfo(type=6, encoding=4, nullable=True, is_array=False, precision=0, scale=0, comp_param=32), is_reserved_keyword=False, src_name=''), TColumnType(col_name='trans', col_type=TTypeInfo(type=6, encoding=4, nullable=True, is_array=False, precision=0, scale=0, comp_param=32), is_reserved_keyword=False, src_name=''), TColumnType(col_name='symbol', col_type=TTypeInfo(type=6, encoding=4, nullable=True, is_array=False, precision=0, scale=0, comp_param=32), is_reserved_keyword=False, src_name=''), TColumnType(col_name='qty', col_type=TTypeInfo(type=1, encoding=0, nullable=True, is_array=False, precision=0, scale=0, comp_param=0), is_reserved_keyword=False, src_name=''), TColumnType(col_name='price', col_type=TTypeInfo(type=3, encoding=0, nullable=True, is_array=False, precision=0, scale=0, comp_param=0), is_reserved_keyword=False, src_name=''), TColumnType(col_name='vol', col_type=TTypeInfo(type=3, encoding=0, nullable=True, is_array=False, precision=0, scale=0, comp_param=0), is_reserved_keyword=False, src_name='')] result = _extract_column_details(data) expected = [ ColumnDetails(name='date_', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='trans', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='symbol', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='qty', type='INT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='price', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0), ColumnDetails(name='vol', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0) ] assert result == expected
def test_get_table_details(self, con, stocks): result = con.get_table_details('stocks') expected = [ ColumnDetails(name='date_', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='trans', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='symbol', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='qty', type='INT', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE'), ColumnDetails(name='price', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE'), ColumnDetails(name='vol', type='FLOAT', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE') ] assert result == expected
def test_build_table_columnar_nulls(self): common_col_params = dict( nullable=True, scale=0, comp_param=0, encoding='NONE', is_array=False, ) col_types = [ ColumnDetails(name='boolean_', type='BOOL', precision=0, **common_col_params), ColumnDetails(name='int_', type='INT', precision=0, **common_col_params), ColumnDetails(name='bigint_', type='BIGINT', precision=0, **common_col_params), ColumnDetails(name='double_', type='DOUBLE', precision=0, **common_col_params), ColumnDetails(name='varchar_', type='STR', precision=0, **common_col_params), ColumnDetails(name='text_', type='STR', precision=0, **common_col_params), ColumnDetails(name='time_', type='TIME', precision=0, **common_col_params), ColumnDetails( name='timestamp_', type='TIMESTAMP', **common_col_params, precision=0, ), ColumnDetails(name='date_', type='DATE', precision=0, **common_col_params), ] data = pd.DataFrame({ 'boolean_': [True, False, None], # Currently Pandas does not support storing None or NaN # in integer columns, so int cols with null # need to be objects. This means our type detection will be # unreliable since if there is no number outside the int32 # bounds in a column with nulls then we will be assuming int 'int_': np.array([0, 1, None], dtype=np.object), 'bigint_': np.array([0, 9223372036854775807, None], dtype=np.object), 'double_': np.array([0, 1, None], dtype=np.float64), 'varchar_': ['a', 'b', None], 'text_': ['a', 'b', None], 'time_': [datetime.time(0, 11, 59), datetime.time(13), None], 'timestamp_': [ pd.Timestamp('2016'), pd.Timestamp('2017'), None, ], 'date_': [ datetime.date(1001, 1, 1), datetime.date(2017, 1, 1), None, ], }) result = _pandas_loaders.build_input_columnar( data, preserve_index=False, col_names=data.columns, col_types=col_types, ) nulls = [False, False, True] bool_na = -128 int_na = -2147483648 bigint_na = -9223372036854775808 ns_na = -9223372037 double_na = 0 expected = [ TColumn(TColumnData(int_col=[1, 0, bool_na]), nulls=nulls), TColumn( TColumnData(int_col=np.array([0, 1, int_na], dtype=np.int32)), nulls=nulls, ), # noqa TColumn( TColumnData(int_col=np.array( [0, 9223372036854775807, bigint_na], dtype=np.int64)), nulls=nulls, ), # noqa TColumn( TColumnData( real_col=np.array([0, 1, double_na], dtype=np.float64)), nulls=nulls, ), # noqa TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls), TColumn(TColumnData(str_col=['a', 'b', '']), nulls=nulls), TColumn(TColumnData(int_col=[719, 46800, bigint_na]), nulls=nulls), TColumn( TColumnData(int_col=[1451606400, 1483228800, ns_na]), nulls=nulls, ), # noqa TColumn( TColumnData(int_col=[-30578688000, 1483228800, bigint_na]), nulls=nulls, ), # noqa ] assert_columnar_equal(result[0], expected)
def test_build_table_columnar_pandas(self): common_col_params = dict( nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE', is_array=False, ) col_types = [ ColumnDetails(name='boolean_', type='BOOL', **common_col_params), ColumnDetails(name='smallint_', type='SMALLINT', **common_col_params), ColumnDetails(name='int_', type='INT', **common_col_params), ColumnDetails(name='bigint_', type='BIGINT', **common_col_params), ColumnDetails(name='float_', type='FLOAT', **common_col_params), ColumnDetails(name='double_', type='DOUBLE', **common_col_params), ColumnDetails(name='varchar_', type='STR', **common_col_params), ColumnDetails(name='text_', type='STR', **common_col_params), ColumnDetails(name='time_', type='TIME', **common_col_params), ColumnDetails( name='timestamp_', type='TIMESTAMP', nullable=True, precision=0, scale=0, comp_param=0, encoding='NONE', is_array=False, ), ColumnDetails(name='date_', type='DATE', **common_col_params), ] data = pd.DataFrame({ 'boolean_': [True, False], 'smallint_': np.array([0, 1], dtype=np.int16), 'int_': np.array([0, 1], dtype=np.int32), 'bigint_': np.array([0, 1], dtype=np.int64), 'float_': np.array([0, 1], dtype=np.float32), 'double_': np.array([0, 1], dtype=np.float64), 'varchar_': ['a', 'b'], 'text_': ['a', 'b'], 'time_': [datetime.time(0, 11, 59), datetime.time(13)], 'timestamp_': [pd.Timestamp('2016'), pd.Timestamp('2017')], 'date_': [ datetime.date(2016, 1, 1), datetime.date(2017, 1, 1), ], }) result = _pandas_loaders.build_input_columnar( data, preserve_index=False, col_names=data.columns, col_types=col_types, ) nulls = [False, False] expected = [ TColumn(TColumnData(int_col=[True, False]), nulls=nulls), TColumn( TColumnData(int_col=np.array([0, 1], dtype=np.int16)), nulls=nulls, ), # noqa TColumn( TColumnData(int_col=np.array([0, 1], dtype=np.int32)), nulls=nulls, ), # noqa TColumn( TColumnData(int_col=np.array([0, 1], dtype=np.int64)), nulls=nulls, ), # noqa TColumn( TColumnData(real_col=np.array([0, 1], dtype=np.float32)), nulls=nulls, ), # noqa TColumn( TColumnData(real_col=np.array([0, 1], dtype=np.float64)), nulls=nulls, ), # noqa TColumn(TColumnData(str_col=['a', 'b']), nulls=nulls), TColumn(TColumnData(str_col=['a', 'b']), nulls=nulls), TColumn(TColumnData(int_col=[719, 46800]), nulls=nulls), TColumn(TColumnData(int_col=[1451606400, 1483228800]), nulls=nulls), # noqa TColumn(TColumnData(int_col=[1451606400, 1483228800]), nulls=nulls), ] assert_columnar_equal(result[0], expected)