def test_datacolumn_init_with_extension_array():
    """Build DataColumns from pandas extension arrays and check series, names and typing."""
    # Categorical extension array with no name supplied
    expected_categories = pd.Series([1, 2, 3], dtype='category')
    categorical_array = pd.Categorical([1, 2, 3])
    categorical_col = DataColumn(categorical_array)
    categorical_result = categorical_col.to_series()
    assert categorical_result.equals(expected_categories)
    assert categorical_result.name is None
    assert categorical_col.name is None
    assert categorical_col.dtype == 'category'
    assert categorical_col.logical_type == Categorical

    # Nullable integer extension array with an explicit name; the mask hides the third value
    expected_ints = pd.Series([1, 2, None, 4], dtype='Int64')
    int_values = np.array([1, 2, 3, 4], dtype="int64")
    int_mask = np.array([False, False, True, False])
    integer_array = pd.arrays.IntegerArray(int_values, mask=int_mask)
    named_col = DataColumn(integer_array, name='extension')
    named_result = named_col.to_series()
    assert named_result.equals(expected_ints)
    assert named_result.name == 'extension'
    assert named_col.name == 'extension'

    # Same integer array, but with a logical type requested by name
    expected_strs = pd.Series([1, 2, None, 4], dtype='string')
    natural_language_col = DataColumn(integer_array, logical_type='NaturalLanguage')
    natural_language_result = natural_language_col.to_series()
    assert natural_language_result.equals(expected_strs)
    assert natural_language_col.logical_type == NaturalLanguage
    assert natural_language_col.dtype == 'string'
def test_datacolumn_inity_with_falsy_name(sample_series):
    """A falsy name (``0``) passed to DataColumn is applied to the column and its series."""
    zero_name = 0
    expected_message = 'Name mismatch between sample_series and 0. DataColumn and underlying series name are now 0'

    # Work on a copy of the fixture series when constructing the column
    series_copy = sample_series.copy()
    with pytest.warns(ColumnNameMismatchWarning, match=expected_message):
        column = DataColumn(series_copy, name=zero_name)

    assert column.name == zero_name
    assert column.to_series().name == zero_name
def test_shape(sample_series):
    """DataColumn.shape is ``(4,)`` and agrees with the shape of its underlying series."""
    column = DataColumn(sample_series)
    column_shape = column.shape
    underlying_shape = column.to_series().shape

    is_dask = dd and isinstance(sample_series, dd.Series)
    if is_dask:
        # For Dask input the first shape entry must be computed before comparing
        column_shape = (column_shape[0].compute(),)
        underlying_shape = (underlying_shape[0].compute(),)

    assert column_shape == (4,)
    assert column_shape == underlying_shape
def test_datacolumn_init(sample_series):
    """A default DataColumn is Categorical with no semantic tags when standard tags are off."""
    column = DataColumn(sample_series, use_standard_tags=False)

    # Koalas doesn't support category dtype, so only cast the expectation for other libraries
    is_koalas = ks and isinstance(sample_series, ks.Series)
    expected = sample_series if is_koalas else sample_series.astype('category')

    actual_pandas = to_pandas(column.to_series())
    expected_pandas = to_pandas(expected)
    pd.testing.assert_series_equal(actual_pandas, expected_pandas)
    assert column.name == expected.name
    assert column.logical_type == Categorical
    assert column.semantic_tags == set()
def test_datacolumn_init_with_name(sample_series, sample_datetime_series):
    """Check name handling: series name by default, explicit name overrides with a warning."""
    original_name = 'sample_series'
    new_name = 'changed_name'

    # No name supplied: the column takes the series name
    default_named = DataColumn(sample_series)
    assert default_named.name == original_name
    assert default_named.to_series().name == original_name

    # Explicit name that differs from the series name triggers a mismatch warning
    series_message = 'Name mismatch between sample_series and changed_name. DataColumn and underlying series name are now changed_name'
    with pytest.warns(ColumnNameMismatchWarning, match=series_message):
        renamed = DataColumn(sample_series, name=new_name)
    assert renamed.name == new_name
    assert renamed.to_series().name == new_name

    # Same check using the datetime fixture series
    datetime_message = 'Name mismatch between sample_datetime_series and changed_name. DataColumn and underlying series name are now changed_name'
    with pytest.warns(ColumnNameMismatchWarning, match=datetime_message):
        renamed_datetime = DataColumn(sample_datetime_series, name=new_name)
    assert renamed_datetime.name == new_name
    assert renamed_datetime.to_series().name == new_name
def test_latlong_formatting(latlongs):
    """Every series in ``latlongs`` yields the same LatLong DataColumn and underlying data."""
    first_input = latlongs[0]
    pandas_expected = pd.Series([(1, 2), (3, 4)])

    # Pick the expected series in the same library as the inputs
    if ks and isinstance(first_input, ks.Series):
        expected = ks.Series([[1, 2], [3, 4]])
    elif dd and isinstance(first_input, dd.Series):
        expected = dd.from_pandas(pandas_expected, npartitions=2)
    else:
        expected = pandas_expected

    column_kwargs = {'logical_type': 'LatLong', 'name': 'test_series'}
    expected_column = DataColumn(expected, **column_kwargs)

    for candidate in latlongs:
        column = DataColumn(candidate, **column_kwargs)
        pd.testing.assert_series_equal(to_pandas(column.to_series()), to_pandas(expected))
        assert column == expected_column
def test_datacolumn_equality(sample_series, sample_datetime_series):
    """Exercise DataColumn ``==`` / ``!=`` across constructor arguments, logical types and data."""
    # Columns built with differing constructor arguments
    base_col = DataColumn(sample_series, logical_type='Categorical')
    same_col = DataColumn(sample_series, logical_type=Categorical)
    tagged_col = DataColumn(sample_series, logical_type=Categorical, semantic_tags={'test'})
    other_name_col = DataColumn(sample_datetime_series, logical_type=Categorical)
    other_dtype_col = DataColumn(sample_series, logical_type=NaturalLanguage)
    described_col = DataColumn(sample_series, logical_type='Categorical', description='description')
    metadata_col = DataColumn(
        sample_series,
        logical_type='Categorical',
        metadata={'interesting_values': ['a', 'b']},
    )

    assert base_col == same_col
    assert base_col != tagged_col
    assert base_col != other_name_col
    assert base_col != other_dtype_col
    assert base_col != described_col
    assert base_col != metadata_col

    # Same logical type class, different parameters
    first_order = Ordinal(order=['a', 'b', 'c'])
    second_order = Ordinal(order=['b', 'a', 'c'])
    first_ordinal_col = DataColumn(sample_series, logical_type=first_order)
    second_ordinal_col = DataColumn(sample_series, logical_type=second_order)

    assert base_col != first_ordinal_col
    assert first_ordinal_col != second_ordinal_col
    assert first_ordinal_col == first_ordinal_col

    # Datetime given as a class, a default instance, and instances with an explicit format
    formatted_ltype = Datetime(datetime_format='%Y-%m%d')
    formatted_col = DataColumn(sample_datetime_series, logical_type=formatted_ltype)
    none_format_col = DataColumn(sample_datetime_series, logical_type=Datetime(datetime_format=None))
    default_instance_col = DataColumn(sample_datetime_series, logical_type=Datetime())
    class_col = DataColumn(sample_datetime_series, logical_type=Datetime)

    assert class_col != default_instance_col
    assert default_instance_col != formatted_col
    assert default_instance_col == none_format_col

    # Identical parameters, different underlying series
    original_data_col = DataColumn(sample_series, logical_type='NaturalLanguage')
    altered_series = sample_series.copy().replace(to_replace='a', value='test')
    altered_data_col = DataColumn(altered_series, logical_type='NaturalLanguage')

    # Underlying data is only expected to affect equality for pandas-backed columns
    backed_by_pandas = isinstance(original_data_col.to_series(), pd.Series)
    if backed_by_pandas:
        assert original_data_col != altered_data_col
    else:
        assert original_data_col == altered_data_col
def test_to_series(sample_series):
    """to_series() hands back the very object stored on the column as ``_series``."""
    column = DataColumn(sample_series)
    returned = column.to_series()

    # Identity first, then value equality after conversion to pandas
    assert returned is column._series
    returned_pandas = to_pandas(returned)
    stored_pandas = to_pandas(column._series)
    pd.testing.assert_series_equal(returned_pandas, stored_pandas)