class Column(Generic[T]):
    """A single named column of values held in a data backend.

    The column keeps its ``name``, the backend storing its values, the
    ``dtype`` it was declared or inferred with, and two indexers (``loc`` and
    ``iloc``) that delegate to the backend's own ``loc`` / ``iloc``.
    """

    name: str
    _data_backend: B
    loc: Column._LocIndexer[T]
    iloc: Column._ILocIndexer[T]
    dtype: DataType

    @classmethod
    def __class_getitem__(_, dtype: T) -> Type[Column]:
        """Return a ``ColumnAlias`` for ``dtype`` when ``Column[dtype]`` is used."""
        # Imported at call time rather than at module import.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from .column_alias import ColumnAlias

        return ColumnAlias(dtype)

    def __init__(
        self: Column[T],
        name: str,
        data: Optional[list[T]] = None,
        index: Optional[list] = None,
        dtype: Optional[T] = None,
    ) -> None:
        """Create a column.

        Args:
            name: Column name; also the key used inside the backend.
            data: Values for the column, or an existing ``DataBackend`` to
                adopt as-is. ``None`` creates an empty column.
            index: Index passed to the ``PandasBackend`` that gets created.
                It is not used when ``data`` already is a ``DataBackend``.
            dtype: Declared dtype. When ``None`` the dtype is inferred from
                the backend via :meth:`infer_dtype`.

        Raises:
            TypeError: If the data cannot be cast to the declared dtype
                (raised by :meth:`_validate_column`).
        """
        self.name = name
        if data is None:
            self._data_backend = PandasBackend(index=index)
            self.dtype = Object
        else:
            if isinstance(data, DataBackend):
                self._data_backend = data
            else:
                self._data_backend = PandasBackend({name: data}, index=index)
            self.dtype = (
                self.infer_dtype(self.name, self._data_backend)
                if dtype is None
                else DataType(dtype)
            )
            # Only columns that were given data are validated / cast.
            self._validate_column()
        self.loc = Column._LocIndexer[T](self)
        self.iloc = Column._ILocIndexer[T](self)

    @staticmethod
    def determine_nested_dtype(data: Iterable) -> DataType:
        """Build a (possibly nested) generic alias describing ``data``.

        Only the first element of ``data`` is inspected. An empty container
        yields ``Object``. A container whose first element equals the
        container itself (for example a one-character string) yields
        ``type(data)``, which also stops the recursion. Otherwise the result
        is ``GenericAlias(type(data), <element type>)``, recursing into
        ``list`` and ``set`` elements.
        """
        if len(data) == 0:
            return Object
        sample = next(iter(data))
        stype = type(sample)
        if sample == data:
            return type(data)
        if stype in (list, set):
            nested_type = Column.determine_nested_dtype(sample)
        else:
            nested_type = stype
        return GenericAlias(type(data), nested_type)

    @staticmethod
    def infer_dtype(column_name: str, data_backend: DataBackend) -> DataType:
        """Infer the ``DataType`` of ``column_name`` in ``data_backend``.

        The backend's reported dtype is used unless it is ``Object``. For a
        non-empty ``Object`` column, the first value is sampled and its Python
        type is used, with ``list`` / ``set`` values expanded through
        :meth:`determine_nested_dtype`.
        """
        dtype: type = data_backend.dtypes[column_name]
        if dtype == Object:
            if len(data_backend) == 0:
                dtype = Object
            else:
                sample = data_backend.iloc[0].values[0]
                stype = type(sample)
                if stype in (list, set):
                    dtype = DataType(
                        GenericAlias(stype, Column.determine_nested_dtype(sample))
                    )
                else:
                    dtype = stype
        return DataType(dtype)

    def _validate_column(self: Column[T]) -> None:
        """Cast the backend to ``self.dtype`` when the inferred dtype differs.

        Nothing is done when the backend reports ``is_link()`` or when the
        dtype is a ``TypeAlias`` instance.

        Raises:
            TypeError: If the backend fails to cast the column. The original
                exception is kept both as the second argument and as the
                explicit ``__cause__``.
        """
        if self._data_backend.is_link() or isinstance(self.dtype, TypeAlias):
            return
        data_type = self.infer_dtype(self.name, self._data_backend)
        if self.dtype != data_type:
            try:
                self._data_backend = self._data_backend.cast_columns(
                    {self.name: self.dtype.pdtype()}
                )
            except Exception as e:
                raise TypeError(
                    f"Failed to cast '{data_type.__name__}' to '{self.dtype.__name__}'",
                    e,
                ) from e

    @staticmethod
    def _backend_operand(other: Any) -> Any:
        """Return ``other``'s backend if it is a ``Column``, else ``other`` itself."""
        if isinstance(other, Column):
            return other._data_backend
        return other

    @property
    def index(self: Column[T]) -> Index:
        """The backend's index."""
        return self._data_backend.index

    def tolist(self: Column[T]) -> list:
        """Return ``values.tolist()`` of the backend."""
        return self._data_backend.values.tolist()

    @property
    def values(self: Column[T]) -> np.ndarray:
        """The backend's ``values``."""
        return self._data_backend.values

    @classmethod
    def _new_data_copy(
        cls: type[Column], name: str, data_backend: DataBackend, dtype: type[NT]
    ) -> "Column[NT]":
        """Wrap an existing backend in a new column without validating it.

        The instance is first built empty through ``cls(name)``; its backend
        and dtype are then overwritten, so no inference or cast happens.
        """
        instance = cls(name)
        instance._data_backend = data_backend
        instance.dtype = DataType(dtype)
        return instance

    def astype(self: Column[T], new_dtype: type[NT]) -> "Column[NT]":
        """Return a new column whose backend has been cast to ``new_dtype``."""
        return Column(
            self.name,
            self._data_backend.cast_columns({self.name: new_dtype}),
            dtype=new_dtype,
        )

    def first(
        self: Column[T], n: Optional[int] = 1, offset: Optional[int] = 0
    ) -> Column[T]:
        """Return a new column built from ``index[offset : offset + n]``.

        The sliced index is handed to the backend's ``iloc``.
        NOTE(review): index labels are passed to a positional indexer here;
        confirm this is intended for non-default indexes.
        """
        return Column(
            self.name,
            self._data_backend.iloc[self.index[offset : offset + n]],
            dtype=self.dtype,
        )

    def nunique(self) -> int:
        """Return the backend's ``nunique()`` entry for this column."""
        return self._data_backend.nunique()[self.name]

    def equals(self: Column[T], other: Any) -> bool:
        """Return the backend's ``equals`` result against ``other``.

        Any ``Column`` (including subclasses) is unwrapped to its backend
        first, matching how the comparison operators treat their operand.
        """
        return self._data_backend.equals(self._backend_operand(other))

    # NOTE(review): the comparison operators return whatever the backend's
    # operator returns; the declared return types below are kept from the
    # original and may not match that -- confirm against the backend.
    def __eq__(self: Column[T], other: Any) -> Column[Boolean]:
        """Delegate ``==`` to the backend."""
        return self._data_backend == self._backend_operand(other)

    def __ne__(self: Column[T], other: Any) -> bool:
        """Delegate ``!=`` to the backend."""
        return self._data_backend != self._backend_operand(other)

    def __gt__(self: Column[T], other: Any) -> bool:
        """Delegate ``>`` to the backend."""
        return self._data_backend > self._backend_operand(other)

    def __ge__(self: Column[T], other: Any) -> bool:
        """Delegate ``>=`` to the backend."""
        return self._data_backend >= self._backend_operand(other)

    def __lt__(self: Column[T], other: Any) -> bool:
        """Delegate ``<`` to the backend."""
        return self._data_backend < self._backend_operand(other)

    def __le__(self: Column[T], other: Any) -> bool:
        """Delegate ``<=`` to the backend."""
        return self._data_backend <= self._backend_operand(other)

    def __len__(self: Column[T]) -> int:
        """Return the length of the backend."""
        return len(self._data_backend)

    def __iter__(self: Column[T]) -> Iterator[T]:
        """Return the backend's ``itertuples()`` iterator."""
        return self._data_backend.itertuples()

    def item(self) -> Any:
        """Return the first value of a column holding at most one value.

        Raises:
            RuntimeError: If the column holds more than one value.
        """
        if len(self) > 1:
            raise RuntimeError("Cannot call `item` on list of column values")
        return self.values[0]

    def __getitem__(self: Column[T], indexable: Indexible) -> Column[T]:
        """Shorthand for ``self.loc[indexable]``."""
        return self.loc[indexable]

    def __str__(self: Column[T]) -> str:
        """Render an empty column as a short summary, otherwise the backend's ``str``."""
        if len(self._data_backend) == 0:
            return f"Column {self.name}([], dtype: {self.dtype.__name__})"
        return str(self._data_backend)

    def __repr__(self: Column[T]) -> str:
        """Same text as ``str(self)``."""
        return str(self)

    class _ILocIndexer(Generic[T]):
        """Indexer that forwards ``[]`` to the column backend's ``iloc``."""

        _column: Column[T]

        def __init__(self, column: Column[T]) -> None:
            self._column = column

        def __getitem__(self, item: Union[int, list, slice]) -> Column[T]:
            """Return a new column wrapping ``backend.iloc[item]``."""
            data = self._column._data_backend.iloc[item]
            return Column._new_data_copy(self._column.name, data, self._column.dtype)

    class _LocIndexer(Generic[T]):
        """Indexer that forwards ``[]`` to the column backend's ``loc``."""

        _column: Column[T]

        def __init__(self, column: Column[T]) -> None:
            self._column = column

        def __getitem__(self, item: Union[Any, list, slice]) -> Column[T]:
            """Return a new column wrapping ``backend.loc[item]``."""
            data = self._column._data_backend.loc[item]
            return Column._new_data_copy(self._column.name, data, self._column.dtype)
class TestPandasBackend:
    """Tests for ``PandasBackend`` using a three-row frame with columns a, b, c."""

    def setup_method(self) -> None:
        """Build the shared frame backend and two single-row series backends."""
        source_frame = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        self.data_backend = PandasBackend(source_frame)
        row_zero = Series({"a": "a", "b": 1, "c": True})
        self.test_series0 = PandasBackend(row_zero, index=[0])
        row_two = Series({"a": "c", "b": 3, "c": True})
        self.test_series2 = PandasBackend(row_two, index=[2])

    def test_iloc(self) -> None:
        """Positional lookup of row 0 equals the row-0 series backend."""
        picked = self.data_backend.iloc[0]
        assert_that(picked.equals(self.test_series0), is_(True))

    def test_loc(self) -> None:
        """Label lookup on a positional slice returns the row labelled 2."""
        subset = self.data_backend.iloc[[0, 2]]
        picked = subset.loc[2]
        assert_that(picked.equals(self.test_series2), is_(True))

    def test_to_dict(self) -> None:
        """``to_dict`` returns column -> list of values for frames and rows."""
        assert_that(
            self.data_backend.to_dict(),
            equal_to(
                {
                    "a": ["a", "b", "c"],
                    "b": [1, 2, 3],
                    "c": [True, False, True],
                }
            ),
        )
        assert_that(
            self.test_series0.to_dict(),
            equal_to({"a": ["a"], "b": [1], "c": [True]}),
        )

    def test_single_row(self) -> None:
        """Each column of a single row exposes its one value via ``values[0]``."""
        wanted = (("a", "a"), ("b", 1), ("c", True))
        for column, value in wanted:
            assert_that(self.test_series0[column].values[0], equal_to(value))
        example_row = self.data_backend.iloc[0]
        for column, value in wanted:
            assert_that(example_row[column].values[0], equal_to(value))

    def test_set_index(self) -> None:
        """``set_index`` with ``ExampleStore.a_index`` re-labels rows by column a."""
        subset = self.data_backend.iloc[[0, 2]]
        assert_that(subset.index.tolist(), equal_to([0, 2]))
        reindexed = subset.set_index(ExampleStore.a_index)
        assert_that(reindexed.index.tolist(), equal_to(["a", "c"]))

    def test_get_index(self) -> None:
        """The index lists the row labels of the frame and of a slice of it."""
        assert_that(self.data_backend.index.tolist(), equal_to([0, 1, 2]))
        subset = self.data_backend.iloc[[0, 2]]
        assert_that(subset.index.tolist(), equal_to([0, 2]))

    def test_reset_index(self) -> None:
        """``reset_index`` renumbers the rows of a slice from zero."""
        subset = self.data_backend.iloc[[0, 2]]
        assert_that(subset.index.tolist(), equal_to([0, 2]))
        renumbered = subset.reset_index()
        assert_that(renumbered.index.tolist(), equal_to([0, 1]))

    def test_contains(self) -> None:
        """A column name is reported as contained in the backend."""
        assert_that("a", is_in(self.data_backend))

    def test_len(self) -> None:
        """``len`` of the backend is 3."""
        row_count = len(self.data_backend)
        assert_that(row_count, equal_to(3))

    def test_columns(self) -> None:
        """Frame and single-row backends both report columns a, b, c."""
        for backend in (self.data_backend, self.test_series0):
            assert_that(backend.columns, equal_to(["a", "b", "c"]))

    def test_iter(self) -> None:
        """Iterating the backend yields the column names in order."""
        names = ["a", "b", "c"]
        for produced, wanted in zip(self.data_backend, names):
            assert_that(produced, equal_to(wanted))

    def test_iterows(self) -> None:
        """Every row from ``iterrows`` equals the ``iloc`` row at the same key."""
        for key, row in self.data_backend.iterrows():
            by_position = self.data_backend.iloc[key]
            assert_that(row.equals(by_position), is_(True))

    def test_itertuples(self) -> None:
        """Every tuple from ``itertuples`` matches the ``iloc`` row's values."""
        for key, a, b, c in self.data_backend.itertuples():
            by_position = self.data_backend.iloc[key]
            for column, produced in (("a", a), ("b", b), ("c", c)):
                assert_that(produced, equal_to(by_position[column].values[0]))

    def test_str(self) -> None:
        """``str`` renders the expected table text."""
        rendered_table = " a b c\nindex \n0 a 1 True\n1 b 2 False\n2 c 3 True"
        assert_that(str(self.data_backend), equal_to(rendered_table))

    def test_repr(self) -> None:
        """``repr`` renders the expected table text."""
        rendered_table = " a b c\nindex \n0 a 1 True\n1 b 2 False\n2 c 3 True"
        assert_that(repr(self.data_backend), equal_to(rendered_table))

    def test_is_link(self) -> None:
        """``is_link()`` is False for this backend."""
        assert_that(self.data_backend.is_link(), equal_to(False))

    def test_link_token(self) -> None:
        """``link_token()`` is None for this backend."""
        assert_that(self.data_backend.link_token(), equal_to(None))

    def test_to_pandas(self) -> None:
        """``to_pandas`` returns a frame equal to the source data."""
        reference = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        same = self.data_backend.to_pandas().equals(reference)
        assert_that(same, equal_to(True))

    def test_values(self) -> None:
        """``values`` matches the values array of an equivalent DataFrame."""
        reference = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        same = np.array_equal(self.data_backend.values, reference.values)
        assert_that(same, equal_to(True))

    def test_dtypes(self) -> None:
        """``dtypes`` maps a/b/c to Object/Int64/Boolean."""
        wanted = {"a": Object, "b": Int64, "c": Boolean}
        assert_that(self.data_backend.dtypes, equal_to(wanted))

    def test_cast_columns(self) -> None:
        """Casting column c to ``int`` yields 1/0/1."""
        reference = DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [1, 0, 1]})
        converted = self.data_backend.cast_columns({"c": int})
        same = np.array_equal(converted.values, reference.values)
        assert_that(same, equal_to(True))

    def test_index(self) -> None:
        """The index equals the expected ``PandasIndex`` before and after ``set_index``."""
        default_index = PandasIndex(PIndex([0, 1, 2], name="index"), [])
        assert_that(self.data_backend.index.equals(default_index), equal_to(True))

        reindexed = self.data_backend.set_index(ExampleStore.ab_index)
        tuple_index = PIndex([("a", 1), ("b", 2), ("c", 3)])
        tuple_index.name = "ab_index"
        compound_index = PandasIndex(tuple_index, ["a", "b"])
        assert_that(reindexed.index.equals(compound_index), equal_to(True))

    def test_index_name(self) -> None:
        """``index_name`` is "index" by default and "ab_index" after ``set_index``."""
        assert_that(self.data_backend.index_name, equal_to("index"))
        reindexed = self.data_backend.set_index(ExampleStore.ab_index)
        assert_that(reindexed.index_name, equal_to("ab_index"))

    def test_equals(self) -> None:
        """``equals`` is True for identical data and False when one value differs."""
        identical = PandasBackend(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(identical), equal_to(True))
        different = PandasBackend(
            {"a": ["a", "b", "d"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(different), equal_to(False))

    def test_eq(self) -> None:
        """``==`` compares two backends element-wise."""
        other = PandasBackend(
            {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
        )
        wanted = DataFrame(
            {
                "a": [False, True, True],
                "b": [True, False, True],
                "c": [True, True, False],
            }
        )
        outcome = self.data_backend == other
        assert_that(outcome.equals(wanted), equal_to(True))

    def test_ne(self) -> None:
        """``!=`` compares two backends element-wise."""
        other = PandasBackend(
            {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
        )
        wanted = DataFrame(
            {
                "a": [True, False, False],
                "b": [False, True, False],
                "c": [False, False, True],
            }
        )
        outcome = self.data_backend != other
        assert_that(outcome.equals(wanted), equal_to(True))

    def test_gt(self) -> None:
        """``>`` between two ``ExampleStore`` instances is element-wise."""
        left = ExampleStore(b=[1, 2, 3])
        right = ExampleStore(b=[1, 1, 3])
        wanted = DataFrame({"b": [False, True, False]})
        outcome = left > right
        assert_that(outcome.equals(wanted), equal_to(True))

    def test_ge(self) -> None:
        """``>=`` between two ``ExampleStore`` instances is element-wise."""
        left = ExampleStore(b=[0, 2, 3])
        right = ExampleStore(b=[1, 1, 3])
        wanted = DataFrame({"b": [False, True, True]})
        outcome = left >= right
        assert_that(outcome.equals(wanted), equal_to(True))

    def test_lt(self) -> None:
        """``<`` between two ``ExampleStore`` instances is element-wise."""
        left = ExampleStore(b=[0, 2, 3])
        right = ExampleStore(b=[1, 1, 3])
        wanted = DataFrame({"b": [True, False, False]})
        outcome = left < right
        assert_that(outcome.equals(wanted), equal_to(True))

    def test_le(self) -> None:
        """``<=`` between two ``ExampleStore`` instances is element-wise."""
        left = ExampleStore(b=[0, 2, 3])
        right = ExampleStore(b=[1, 1, 3])
        wanted = DataFrame({"b": [True, False, True]})
        outcome = left <= right
        assert_that(outcome.equals(wanted), equal_to(True))

    def test_getitem(self) -> None:
        """Indexing by a column name returns a backend holding only that column."""
        only_b = PandasBackend({"b": [1, 2, 3]})
        assert_that(self.data_backend["b"].equals(only_b), equal_to(True))

    def test_getitems(self) -> None:
        """``getitems`` returns a backend holding the requested columns."""
        a_and_b = PandasBackend({"a": ["a", "b", "c"], "b": [1, 2, 3]})
        selected = self.data_backend.getitems(["a", "b"])
        assert_that(selected.equals(a_and_b), equal_to(True))

    def test_getmask(self) -> None:
        """``getmask`` keeps the rows whose mask entry is True."""
        masked = self.data_backend.getmask([True, False, True])
        kept_index = PandasIndex(PIndex([0, 2], name="index"), [])
        wanted = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=kept_index,
        )
        # Result is discarded; kept from the original test.
        # NOTE(review): looks like a leftover smoke call -- confirm it is needed.
        repr(wanted)
        assert_that(masked.equals(wanted), equal_to(True))

    def test_query(self) -> None:
        """``query`` keeps the rows matching a == "a" or b == 3."""
        condition = (ExampleStore.a == "a") | (ExampleStore.b == 3)
        matched = self.data_backend.query(condition)
        kept_index = PandasIndex(PIndex([0, 2], name="index"), [])
        wanted = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=kept_index,
        )
        assert_that(matched.equals(wanted), equal_to(True))

    def test_setitem(self) -> None:
        """Assigning to a column name replaces that column's values."""
        self.data_backend["a"] = ["d", "e", "f"]
        wanted = PandasBackend(
            {"a": ["d", "e", "f"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(wanted), equal_to(True))

    def test_append(self) -> None:
        """``append`` with ``ignore_index=True`` adds the row at the end."""
        extra_row = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
        combined = self.data_backend.append(extra_row, ignore_index=True)
        wanted = PandasBackend(
            {
                "a": ["a", "b", "c", "d"],
                "b": [1, 2, 3, 4],
                "c": [True, False, True, False],
            }
        )
        assert_that(combined.equals(wanted), equal_to(True))

    def test_drop_indices(self) -> None:
        """``drop_indices([1])`` removes the row labelled 1."""
        remaining = self.data_backend.drop_indices([1])
        kept_index = PandasIndex(PIndex([0, 2], name="index"), [])
        wanted = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=kept_index,
        )
        assert_that(remaining.equals(wanted), equal_to(True))

    def test_concat(self) -> None:
        """``PandasBackend.concat`` with ``ignore_index=True`` stacks the backends."""
        extra_row = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
        combined = PandasBackend.concat(
            [self.data_backend, extra_row], ignore_index=True
        )
        wanted = PandasBackend(
            {
                "a": ["a", "b", "c", "d"],
                "b": [1, 2, 3, 4],
                "c": [True, False, True, False],
            }
        )
        assert_that(combined.equals(wanted), equal_to(True))