def test_from_string():
    """Check ``type_conversion`` builders that parse string and mixed columns.

    The first batch converts decimal and hexadecimal string columns (derived
    from ``conversion_data()``) to ``int`` and ``float``.  The second batch
    runs against a frame whose columns mix real values with blanks and other
    tokens, and checks the column totals plus where missing values show up.

    Every builder receives its own output name (``Col1``, ``Col2``, ...).
    """
    data_id, column_type = "1", "type_conversion"

    def _builder(idx, cfg):
        # The name is derived from an explicit index: the previous
        # ``"Col{}".format(++i)`` never advanced ``i`` because ``++i`` is just
        # unary plus applied twice, so every column ended up named "Col0".
        return ColumnBuilder(data_id, column_type, "Col{}".format(idx), cfg)

    df = conversion_data()
    # Hex renderings of the numeric columns, used as string inputs below.
    df.loc[:, "hex_int"] = df["int"].apply(hex)
    df.loc[:, "hex_float"] = df["float"].apply(float.hex)

    # (config, predicate handed to verify_builder)
    string_cases = [
        (
            {"col": "str_num", "to": "int", "from": "str"},
            lambda col: col.values[0] == 1,
        ),
        (
            {"col": "str_num", "to": "float", "from": "str"},
            lambda col: col.values[0] == 1.5,
        ),
        (
            {"col": "hex_int", "to": "int", "from": "str"},
            lambda col: col.values[0] == 1,
        ),
        (
            {"col": "hex_float", "to": "float", "from": "str"},
            lambda col: col.values[0] == 1.5,
        ),
    ]

    with ExitStack() as stack:
        stack.enter_context(
            mock.patch("dtale.global_state.DATA", {data_id: df}))
        for idx, (cfg, check) in enumerate(string_cases, start=1):
            verify_builder(_builder(idx, cfg), check)

    mixed_df = pd.DataFrame(
        dict(
            a=[1, 2, 3, "", 5, 6, 7, 8, 9, 10],
            b=[True, True, False, "", "False", True, False, True, False, True],
            c=["1", "00", "1.05", " ", "  ", "", "02", "..", "none", "nan"],
        ))

    # (config, predicate handed to verify_builder, row expected to be NaN or
    # None when no single-row check applies)
    mixed_cases = [
        (
            {"col": "a", "to": "float", "from": "mixed-integer"},
            lambda col: col.sum() == 51,
            3,
        ),
        (
            {"col": "b", "to": "bool", "from": "mixed-integer"},
            lambda col: col.sum() == 5,
            3,
        ),
        (
            {"col": "c", "to": "float", "from": "str"},
            lambda col: col.sum() == 4.05 and col.isnull().sum() == 6,
            None,
        ),
    ]

    # Continue numbering after the first batch so names stay unique overall.
    first_idx = len(string_cases) + 1
    with ExitStack() as stack:
        stack.enter_context(
            mock.patch("dtale.global_state.DATA", {data_id: mixed_df}))
        for idx, (cfg, check, nan_row) in enumerate(mixed_cases, start=first_idx):
            builder = _builder(idx, cfg)
            verify_builder(builder, check)
            if nan_row is not None:
                # The blank entry at this row must convert to a missing value.
                assert np.isnan(builder.build_column().values[nan_row])
def test_zscore_normalize():
    """Check the ``zscore_normalize`` builder on a varying and a constant column.

    Column ``i`` holds 0..99, so its normalised values must sum to
    (numerically) zero.  Column ``a`` is constant, and building it is expected
    to raise an error whose message contains ``ZERO_STD_ERROR``.
    """
    df = pd.DataFrame([dict(a=1, i=i) for i in range(100)])
    data_id, column_type = "1", "zscore_normalize"
    build_data_inst({data_id: df})

    # Explicit, distinct output names: the previous ``"Col{}".format(++i)``
    # always produced "Col0" because ``++i`` is double unary plus, not an
    # increment.
    builder = ColumnBuilder(data_id, column_type, "Col1", {"col": "i"})
    # Compare against zero with a tolerance rather than pinning one specific
    # round-off residue, which depends on the summation order used by the
    # installed numpy/pandas.
    verify_builder(builder, lambda col: abs(col.sum()) < 1e-9)

    # Construction stays inside the ``raises`` block, as before, so an error
    # from either the constructor or ``build_column`` is captured.
    with pytest.raises(BaseException) as error:
        builder = ColumnBuilder(data_id, column_type, "Col2", {"col": "a"})
        builder.build_column()
    # Checked after the block exits; inside it this line would be unreachable
    # once the exception fires.
    assert ZERO_STD_ERROR in str(error.value)
def test_type_conversion(unittest):
    """Run the ``type_conversion`` builder across a matrix of source/target types.

    A single-row frame supplies one column per source representation (string,
    int, float, timestamp, bool and categorical).  Each case builds a column
    from a conversion config and checks the first value (or the dtype) of the
    resulting series.

    :param unittest: pytest fixture requested by the test signature; it is not
        referenced in the body.
    """
    df = pd.DataFrame([{
        'str_num': '1.5',
        'str_date': '20200101',
        'str_date2': '1/1/2020',
        'str_bool': 'True',
        'int': 1,
        'int_date': 20200101,
        'int_s': 1490195805,
        'float': 1.5,
        'date': pd.Timestamp('20200101'),
        'bool': True,
        'cat_int': 1,
        'cat_bool': 'True',
        'cat_str': 'a'
    }])
    for c in ['cat_int', 'cat_bool', 'cat_str']:
        # Replace the column outright.  ``df.loc[:, c] = ...`` writes into the
        # existing array on recent pandas and keeps the old dtype, which would
        # leave these columns non-categorical.
        df[c] = df[c].astype('category')

    def first_equals(expected):
        # Predicate: first value of the series equals ``expected``.
        return lambda s: s.values[0] == expected

    def first_date_is(yyyymmdd):
        # Predicate: first value, read as a timestamp, falls on ``yyyymmdd``.
        return lambda s: pd.Timestamp(s.values[0]).strftime('%Y%m%d') == yyyymmdd

    def first_is_numpy_true(s):
        # Predicate: first value is a numpy boolean equal to True.
        return isinstance(s.values[0], np.bool_) and np.bool_(True) == s.values[0]

    # (conversion config, predicate applied to the built series), in the
    # same order the conversions were originally exercised.
    cases = [
        ({'col': 'str_num', 'to': 'int', 'from': 'str'}, first_equals(1)),
        ({'col': 'str_num', 'to': 'float', 'from': 'str'}, first_equals(1.5)),
        ({'col': 'str_date', 'to': 'date', 'from': 'object'},
         first_date_is('20200101')),
        ({'col': 'str_date2', 'to': 'date', 'from': 'object'},
         first_date_is('20200101')),
        ({'col': 'str_bool', 'to': 'bool', 'from': 'object'},
         lambda s: s.values[0]),
        ({'col': 'int', 'to': 'float', 'from': 'int'}, first_equals(1.0)),
        ({'col': 'int', 'to': 'str', 'from': 'int'}, first_equals('1')),
        ({'col': 'int', 'to': 'category', 'from': 'int'},
         lambda s: s.dtype.name == 'category'),
        ({'col': 'int', 'to': 'bool', 'from': 'int'}, first_is_numpy_true),
        ({'col': 'int_date', 'to': 'date', 'from': 'int', 'unit': 'YYYYMMDD'},
         first_date_is('20200101')),
        ({'col': 'int_s', 'to': 'date', 'from': 'int', 'unit': 's'},
         first_date_is('20170322')),
        ({'col': 'float', 'to': 'int', 'from': 'float'}, first_equals(1)),
        ({'col': 'float', 'to': 'str', 'from': 'float'}, first_equals('1.5')),
        ({'col': 'date', 'to': 'str', 'from': 'datetime64', 'fmt': '%m/%d/%Y'},
         first_equals('01/01/2020')),
        ({'col': 'date', 'to': 'int', 'from': 'datetime64', 'unit': 'YYYYMMDD'},
         first_equals(20200101)),
        # NOTE(review): 1577854800 is 2020-01-01 05:00 UTC expressed in
        # seconds, so this expectation looks dependent on the machine's local
        # timezone (and is not in milliseconds) -- confirm against the builder.
        ({'col': 'date', 'to': 'int', 'from': 'datetime64', 'unit': 'ms'},
         first_equals(1577854800)),
        ({'col': 'bool', 'to': 'int', 'from': 'bool'}, first_equals(1)),
        ({'col': 'bool', 'to': 'str', 'from': 'bool'}, first_equals('True')),
        ({'col': 'cat_int', 'to': 'int', 'from': 'category'}, first_equals(1)),
        ({'col': 'cat_bool', 'to': 'bool', 'from': 'category'},
         first_is_numpy_true),
        ({'col': 'cat_str', 'to': 'str', 'from': 'category'}, first_equals('a')),
    ]

    data_id, column_type = '1', 'type_conversion'
    with ExitStack() as stack:
        stack.enter_context(
            mock.patch('dtale.global_state.DATA', {data_id: df}))
        # ``enumerate`` supplies Col1, Col2, ...; the earlier ``++i`` was a
        # no-op (double unary plus), so every column was named "Col0".
        for idx, (cfg, check) in enumerate(cases, start=1):
            builder = ColumnBuilder(data_id, column_type,
                                    'Col{}'.format(idx), cfg)
            s = builder.build_column()
            assert check(s), 'unexpected result for config {}'.format(cfg)