def __init__(
    self: "DatabaseBackend",
    store_class: Type[T],
    database: Database,
    data_token: DataToken,
    index: Optional[DatabaseIndex] = None,
    selected_columns: Optional[list[str]] = None,
    read_only: bool = True,
) -> None:
    """Bind the backend to a database table.

    Args:
        store_class: Stored on the instance as ``_store_class``.
        database: Database handle; queried through ``table_columns`` when
            ``selected_columns`` is not given.
        data_token: Token handed to ``database.table_columns``.
        index: Index to use. ``None`` builds a positional index named
            ``"index"`` spanning ``len(self)`` entries; a value that is
            neither a ``DatabaseIndex`` nor a ``PandasIndex`` is wrapped in a
            ``PandasIndex``.
        selected_columns: Column names to use; ``None`` takes whatever
            ``database.table_columns(data_token)`` returns.
        read_only: Must be ``True``.

    Raises:
        NotImplementedError: If ``read_only`` is ``False``.
    """
    if not read_only:
        raise NotImplementedError(
            "The current version of Tanuki does not support Store to DB writing"
        )

    self._store_class = store_class
    self._database = database
    self._data_token = data_token
    self._read_only = read_only

    self._selected_columns = (
        self._database.table_columns(self._data_token)
        if selected_columns is None
        else selected_columns
    )

    if index is None:
        # NOTE(review): len(self) presumably depends on the attributes set
        # above, so this must stay after them -- confirm against __len__.
        positions = PIndex(np.arange(0, len(self)), name="index")
        self._index = PandasIndex(positions, [])
    elif isinstance(index, (DatabaseIndex, PandasIndex)):
        self._index = index
    else:
        self._index = PandasIndex(index)

    self._loc = _LocIndexer(self)
    self._iloc = _ILocIndexer(self)
def __init__(
    self,
    data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
    index: Optional[PandasIndex] = None,
) -> None:
    """Build the backend from pandas data or a plain dict.

    Args:
        data: Source data.
            * ``None`` or an empty dict -> empty object-dtype frame.
            * ``Series`` -> a single-row frame (the series transposed).
            * ``DataFrame`` -> wrapped in a new ``DataFrame``.
            * ``dict`` -> if its first value is a non-iterable or a string,
              the dict is treated as one row; otherwise as
              column -> values.
        index: Index to install on the frame. ``None`` names the frame's
            current index ``"index"`` and wraps it; anything that is not
            already a ``PandasIndex`` is wrapped in one.

    Raises:
        ValueError: If ``data`` is of any other type. The checks are exact
            type matches, so subclasses are rejected too.
    """
    if data is None:
        self._data = DataFrame(dtype="object")
    elif type(data) is Series:
        self._data = cast(Series, data).to_frame().transpose()
    elif type(data) is DataFrame:
        self._data = DataFrame(data)
    elif type(data) is dict:
        if not data:
            # An empty dict has no sample value to inspect; previously this
            # raised StopIteration. Treat it like "no data".
            self._data = DataFrame(dtype="object")
        else:
            sample_value = next(iter(data.values()))
            is_single_row = not isinstance(sample_value, Iterable) or isinstance(
                sample_value, str
            )
            if is_single_row:
                self._data = Series(data).to_frame().transpose()
            else:
                self._data = DataFrame(data)
    else:
        raise ValueError(
            f"Received unexpected value type {type(data)}: {data}")

    if index is None:
        self._data.index.name = "index"
        self._index = PandasIndex(self._data.index, [])
    else:
        if not isinstance(index, PandasIndex):
            index = PandasIndex(index)
        self._data.index = index._data
        self._index = index

    self._loc = _LocIndexer(self)
    self._iloc = _ILocIndexer(self)
def test_index(self) -> None:
    """Check the backend's initial index and the index after ``set_index``."""
    default_index = PandasIndex(PIndex([0, 1, 2], name="index"), [])
    assert_that(self.data_backend.index.equals(default_index), equal_to(True))

    reindexed = self.data_backend.set_index(ExampleStore.ab_index)

    tuples = PIndex([("a", 1), ("b", 2), ("c", 3)])
    tuples.name = "ab_index"
    ab_expected = PandasIndex(tuples, ["a", "b"])
    assert_that(reindexed.index.equals(ab_expected), equal_to(True))
def test_to_pandas(self) -> None:
    """``to_pandas`` on each fixture index must equal the matching PandasIndex."""
    # Plain index: compared against the fixture itself.
    assert_that(self.index.to_pandas().equals(self.index), equal_to(True))

    # Single-column index.
    a_expected = PandasIndex(PIndex(["a", "b", "c"], name="a_index"), ["a"])
    assert_that(self.a_index.to_pandas().equals(a_expected), equal_to(True))

    # Two-column index built from tuples.
    tuples = PIndex([("a", 1), ("b", 2), ("c", 3)])
    tuples.name = "ab_index"
    ab_expected = PandasIndex(tuples, ["a", "b"])
    assert_that(self.ab_index.to_pandas().equals(ab_expected), equal_to(True))
def test_equals(self):
    """``equals`` is true only when values, name and columns all match."""
    # (pandas index, columns, expected result of self.index.equals(...))
    cases = [
        (PIndex([0, 1, 2], name="index"), ["a", "b"], True),
        (PIndex([0, 1], name="index"), ["a", "b"], False),
        (PIndex([0, 1, 2], name="index2"), ["a", "b"], False),
        (PIndex([0, 1, 2], name="index"), ["a"], False),
    ]
    for pandas_index, columns, outcome in cases:
        candidate = PandasIndex(pandas_index, columns)
        assert_that(self.index.equals(candidate), equal_to(outcome))
def test_getitem(self):
    """Scalar lookup returns the raw value; list lookup returns an index."""
    # Plain index.
    plain_expected = PandasIndex(PIndex([1], name="index"), [])
    assert_that(self.index[1], equal_to(1))
    assert_that(self.index[[1]].equals(plain_expected), equal_to(True))

    # Single-column index.
    a_expected = PandasIndex(PIndex(["b"], name="a_index"), ["a"])
    assert_that(self.a_index[1], equal_to("b"))
    assert_that(self.a_index[[1]].equals(a_expected), equal_to(True))

    # Two-column index: the scalar lookup yields a tuple.
    tuples = PIndex([("b", 2)])
    tuples.name = "ab_index"
    ab_expected = PandasIndex(tuples, ["a", "b"])
    assert_that(self.ab_index[1], equal_to(("b", 2)))
    assert_that(self.ab_index[[1]].equals(ab_expected), equal_to(True))
def test_drop_indices(self) -> None:
    """Dropping index label 1 must leave the rows labelled 0 and 2."""
    remaining = self.data_backend.drop_indices([1])

    surviving_index = PandasIndex(PIndex([0, 2], name="index"), [])
    surviving_rows = {"a": ["a", "c"], "b": [1, 3], "c": [True, True]}
    expected = PandasBackend(surviving_rows, index=surviving_index)

    assert_that(remaining.equals(expected), equal_to(True))
def test_query(self) -> None:
    """Query ``a == "a" or b == 3`` must return the rows labelled 0 and 2."""
    condition = (ExampleStore.a == "a") | (ExampleStore.b == 3)
    matched = self.data_backend.query(condition)

    matched_index = PandasIndex(PIndex([0, 2], name="index"), [])
    matched_rows = {"a": ["a", "c"], "b": [1, 3], "c": [True, True]}
    expected = PandasBackend(matched_rows, index=matched_index)

    assert_that(matched.equals(expected), equal_to(True))
def test_getmask(self) -> None:
    """Mask ``[True, False, True]`` must keep the rows labelled 0 and 2."""
    masked = self.data_backend.getmask([True, False, True])

    kept_index = PandasIndex(PIndex([0, 2], name="index"), [])
    kept_rows = {"a": ["a", "c"], "b": [1, 3], "c": [True, True]}
    expected = PandasBackend(kept_rows, index=kept_index)

    # NOTE(review): the result is discarded; presumably this only checks
    # that __repr__ does not raise -- confirm before removing.
    repr(expected)

    assert_that(masked.equals(expected), equal_to(True))
def test_equals(self):
    """Each fixture index must equal an equivalently constructed index."""
    # Plain index compared with a PandasIndex.
    plain = PandasIndex(PIndex([0, 1, 2], name="index"), [])
    assert_that(self.index.equals(plain), equal_to(True))

    # Single-column index rebuilt from the backend's "a" column.
    a_rebuilt = DatabaseIndex("a_index", self.backend["a"])
    assert_that(self.a_index.equals(a_rebuilt), equal_to(True))

    # Two-column index rebuilt from the backend's "a" and "b" columns.
    ab_rebuilt = DatabaseIndex("ab_index", self.backend.getitems(["a", "b"]))
    assert_that(self.ab_index.equals(ab_rebuilt), equal_to(True))
def reset_index(self) -> "DatabaseBackend":
    """Return a new backend over the same table with a fresh positional index.

    The new index is ``0 .. len(self) - 1`` and is named ``"index"``. The
    store class, database, data token, selected columns and read-only flag
    are passed through unchanged.

    Returns:
        A new ``DatabaseBackend``. The previous annotation named
        ``PandasBackend``, which did not match the object constructed here.
    """
    pindex = PIndex(np.arange(0, len(self)), name="index")
    return DatabaseBackend(
        self._store_class,
        self._database,
        self._data_token,
        index=PandasIndex(pindex, []),
        selected_columns=self._selected_columns,
        read_only=self._read_only,
    )
def setup_method(self):
    """Create the fixture: values 0..2, name ``"index"``, columns ``a`` and ``b``."""
    positions = PIndex(np.arange(0, 3), name="index")
    self.index = PandasIndex(positions, ["a", "b"])
class PandasBackend(DataBackend):
    """``DataBackend`` implementation that keeps its data in a pandas ``DataFrame``."""

    # Underlying table.
    _data: DataFrame
    # Index wrapper kept alongside ``_data.index``.
    _index: PandasIndex
    # Indexer objects returned by the ``loc`` / ``iloc`` properties.
    _loc: _LocIndexer
    _iloc: _ILocIndexer

    def __init__(
        self,
        data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
        index: Optional[PandasIndex] = None,
    ) -> None:
        """Build the backend from pandas data or a plain dict.

        Args:
            data: Source data.
                * ``None`` or an empty dict -> empty object-dtype frame.
                * ``Series`` -> a single-row frame (the series transposed).
                * ``DataFrame`` -> wrapped in a new ``DataFrame``.
                * ``dict`` -> if its first value is a non-iterable or a
                  string, the dict is treated as one row; otherwise as
                  column -> values.
            index: Index to install on the frame. ``None`` names the frame's
                current index ``"index"`` and wraps it; anything that is not
                already a ``PandasIndex`` is wrapped in one.

        Raises:
            ValueError: If ``data`` is of any other type. The checks are
                exact type matches, so subclasses are rejected too.
        """
        if data is None:
            self._data = DataFrame(dtype="object")
        elif type(data) is Series:
            self._data = cast(Series, data).to_frame().transpose()
        elif type(data) is DataFrame:
            self._data = DataFrame(data)
        elif type(data) is dict:
            if not data:
                # An empty dict has no sample value to inspect; previously
                # this raised StopIteration. Treat it like "no data".
                self._data = DataFrame(dtype="object")
            else:
                sample_value = next(iter(data.values()))
                is_single_row = not isinstance(
                    sample_value, Iterable) or isinstance(sample_value, str)
                if is_single_row:
                    self._data = Series(data).to_frame().transpose()
                else:
                    self._data = DataFrame(data)
        else:
            raise ValueError(
                f"Received unexpected value type {type(data)}: {data}")

        if index is None:
            self._data.index.name = "index"
            self._index = PandasIndex(self._data.index, [])
        else:
            if not isinstance(index, PandasIndex):
                index = PandasIndex(index)
            self._data.index = index._data
            self._index = index

        self._loc = _LocIndexer(self)
        self._iloc = _ILocIndexer(self)

    def is_link(self) -> bool:
        """Always ``False`` for this backend."""
        return False

    def link_token(self) -> Optional[DataToken]:
        """Always ``None`` for this backend."""
        return None

    def to_pandas(self) -> DataFrame:
        """Return the underlying ``DataFrame`` itself (not a copy)."""
        return self._data

    @property
    def columns(self) -> list[str]:
        """Column labels of the frame as a list."""
        return self._data.columns.tolist()

    @property
    def values(self) -> np.ndarray:
        """Raw values, with a length-1 axis squeezed away.

        A single-column frame yields a 1-D array of that column; otherwise a
        single-row frame yields a 1-D array of that row; anything else is
        returned as the 2-D array.
        """
        data_values = self._data.values
        shape = data_values.shape
        if shape[1] == 1:
            return np.squeeze(data_values, axis=1)
        elif shape[0] == 1:
            return np.squeeze(data_values, axis=0)
        else:
            return data_values

    @property
    def dtypes(self) -> dict[str, DataType]:
        """Map each column label to a ``DataType`` built from its pandas dtype."""
        return {
            col: DataType(dtype)
            for col, dtype in self._data.dtypes.items()
        }

    def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend:
        """Return a new backend with columns cast via ``astype(..., errors="ignore")``."""
        return PandasBackend(self._data.astype(column_dtypes, errors="ignore"))

    def to_dict(self) -> dict[str, Any]:
        """Return the frame as ``DataFrame.to_dict("list")``."""
        return self._data.to_dict("list")

    @property
    def index(self) -> Index:
        """The index wrapper held by this backend."""
        return self._index

    @property
    def index_name(self) -> Union[str, list[str]]:
        """Name of the underlying frame's index."""
        return self._data.index.name

    @property
    def loc(self: PandasBackend) -> LocIndexer[PandasBackend]:
        """The ``_LocIndexer`` created for this backend."""
        return self._loc

    @property
    def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]:
        """The ``_ILocIndexer`` created for this backend."""
        return self._iloc

    def equals(self, other: PandasBackend) -> bool:
        """Whole-object equality: same exact type, same values, equal index."""
        if type(other) is not PandasBackend:
            return False
        return np.array_equal(self._data.values, other._data.values) and self._index.equals(
            other._index)

    # The rich comparisons below are element-wise: they unwrap another
    # PandasBackend to its frame and delegate to the pandas operator.

    def __eq__(self, other: Any) -> DataFrame:
        """Element-wise ``==`` against ``other``."""
        if isinstance(other, PandasBackend):
            other = other._data
        return self._data == other

    def __ne__(self, other: Any) -> DataFrame:
        """Element-wise ``!=`` against ``other``."""
        if isinstance(other, PandasBackend):
            other = other._data
        return self._data != other

    def __gt__(self, other: Any) -> DataFrame:
        """Element-wise ``>`` against ``other``."""
        if isinstance(other, PandasBackend):
            other = other._data
        return self._data > other

    def __ge__(self, other: Any) -> DataFrame:
        """Element-wise ``>=`` against ``other``."""
        if isinstance(other, PandasBackend):
            other = other._data
        return self._data >= other

    def __lt__(self, other: Any) -> DataFrame:
        """Element-wise ``<`` against ``other``."""
        if isinstance(other, PandasBackend):
            other = other._data
        return self._data < other

    def __le__(self, other: Any) -> DataFrame:
        """Element-wise ``<=`` against ``other``."""
        if isinstance(other, PandasBackend):
            other = other._data
        return self._data <= other

    def __len__(self) -> int:
        """Length of the underlying frame."""
        return len(self._data)

    def __iter__(self) -> Generator[str, None, None]:
        """Iterate over the underlying frame (pandas yields column labels)."""
        return iter(self._data)

    def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]:
        """Yield ``(label, backend)`` pairs, one single-row backend per row."""
        for i, row in self._data.iterrows():
            yield (i, PandasBackend(row.to_frame().transpose()))

    def itertuples(self, ignore_index: bool = False):
        """Yield the frame's ``itertuples`` rows; the index is omitted when ``ignore_index``."""
        for values in self._data.itertuples(index=not ignore_index):
            yield values

    def __getitem__(self, item: str) -> Any:
        """Return column ``item`` wrapped in a new single-column backend."""
        return PandasBackend(self._data[item].to_frame())

    def getitems(self, items: list[str]) -> PandasBackend:
        """Return a new backend holding only the columns in ``items``."""
        return PandasBackend(self._data[items])

    def getmask(self, mask: list[bool]) -> PandasBackend:
        """Return a new backend from indexing the frame with ``mask``."""
        return PandasBackend(self._data[mask])

    def query(self, query: "Query") -> PandasBackend:
        """Compile ``query`` with ``PandasQueryCompiler`` and index the frame with the result."""
        # Imported inside the method, as in the original code.
        from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler
        query_compiler = PandasQueryCompiler(self._data)
        query = query_compiler.compile(query)
        return PandasBackend(self._data[query])

    def __setitem__(self, items: str, value: Any) -> None:
        """Assign ``value`` to column(s) ``items``, unwrapping a ``PandasBackend``."""
        if isinstance(value, PandasBackend):
            value = value._data
        self._data[items] = value

    def get_index(self, index_alias: IndexAlias) -> Index:
        """Build the ``PandasIndex`` for ``index_alias`` without changing this backend."""
        cols = [str(col) for col in index_alias.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index_alias.name
        return PandasIndex(new_data.index, cols)

    def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
        """Return a new backend indexed by the columns of ``index``, named ``index.name``."""
        cols = [str(col) for col in index.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index.name
        new_index = PandasIndex(new_data.index, cols)
        return PandasBackend(new_data, new_index)

    def reset_index(self: PandasBackend) -> PandasBackend:
        """Return a new backend whose index was reset (old index dropped) and named ``"index"``."""
        new_data = self._data.reset_index(drop=True)
        new_data.index.name = "index"
        new_index = PandasIndex(new_data.index, [])
        return PandasBackend(new_data, new_index)

    def append(
        self: PandasBackend,
        new_backend: PandasBackend,
        ignore_index: bool = False,
    ) -> PandasBackend:
        """Return a new backend with ``new_backend``'s rows after this one's.

        Uses ``pd.concat`` because ``DataFrame.append`` was deprecated in
        pandas 1.4 and removed in pandas 2.0.
        """
        return PandasBackend(
            pd.concat([self._data, new_backend._data], ignore_index=ignore_index))

    def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend:
        """Return a new backend without the rows labelled by ``indices``."""
        return PandasBackend(self._data.drop(indices))

    @classmethod
    def concat(
        cls: type[PandasBackend],
        all_backends: list[PandasBackend],
        ignore_index: bool = False,
    ) -> PandasBackend:
        """Concatenate the frames of ``all_backends`` into one new backend."""
        all_data = [backend._data for backend in all_backends]
        return PandasBackend(pd.concat(all_data, ignore_index=ignore_index))

    def nunique(self) -> int:
        """Return ``DataFrame.nunique()`` of the underlying frame."""
        # NOTE(review): pandas returns a per-column Series here, which does
        # not match the ``int`` annotation -- confirm the intended type.
        return self._data.nunique()

    def __str__(self) -> str:
        """String form of the underlying frame."""
        return str(self._data)

    def __repr__(self) -> str:
        """Same text as ``str(self)``."""
        return str(self)
def reset_index(self: PandasBackend) -> PandasBackend:
    """Return a new backend whose index was reset and named ``"index"``.

    The previous index is dropped rather than kept as a column, and the
    resulting ``PandasIndex`` has no index columns.
    """
    frame = self._data.reset_index(drop=True)
    frame.index.name = "index"
    return PandasBackend(frame, PandasIndex(frame.index, []))
def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
    """Return a new backend indexed by the columns named in ``index``.

    Column entries are converted with ``str`` before use, and the resulting
    pandas index is given ``index.name``.
    """
    column_names = list(map(str, index.columns))
    reindexed = self._data.set_index(column_names)
    reindexed.index.name = index.name
    return PandasBackend(reindexed, PandasIndex(reindexed.index, column_names))
def get_index(self, index_alias: IndexAlias) -> Index:
    """Build the ``PandasIndex`` described by ``index_alias``.

    Works on the frame returned by ``set_index``, so ``self._data`` is not
    reassigned. Column entries are converted with ``str`` and the resulting
    pandas index is given ``index_alias.name``.
    """
    column_names = list(map(str, index_alias.columns))
    pandas_index = self._data.set_index(column_names).index
    pandas_index.name = index_alias.name
    return PandasIndex(pandas_index, column_names)
def test_getitem(self):
    """Scalar lookup returns the raw value; list lookup returns an index."""
    single_entry = PandasIndex(PIndex([1], name="index"), ["a", "b"])

    scalar_result = self.index[1]
    assert_that(scalar_result, equal_to(1))

    list_result = self.index[[1]]
    assert_that(list_result.equals(single_entry), equal_to(True))
class TestPandasIndex:
    """Tests for ``PandasIndex`` using a three-entry integer index fixture."""

    def setup_method(self) -> None:
        """Create the fixture: values 0..2, name ``"index"``, columns ``a`` and ``b``."""
        self.index = PandasIndex(PIndex(np.arange(0, 3), name="index"), ["a", "b"])

    def test_name(self) -> None:
        """``name`` reports the pandas index name."""
        assert_that(self.index.name, equal_to("index"))

    def test_columns(self) -> None:
        """``columns`` reports the column list given at construction."""
        assert_that(self.index.columns, equal_to(["a", "b"]))

    def test_to_pandas(self) -> None:
        """``to_pandas`` yields an index equal to the fixture."""
        assert_that(self.index.to_pandas().equals(self.index), equal_to(True))

    def test_getitem(self) -> None:
        """Scalar lookup returns the raw value; list lookup returns an index."""
        expected = PandasIndex(PIndex([1], name="index"), ["a", "b"])
        assert_that(self.index[1], equal_to(1))
        assert_that(self.index[[1]].equals(expected), equal_to(True))

    def test_values(self) -> None:
        """``values`` exposes the entries as an array."""
        assert_that(np.array_equal(self.index.values, np.array([0, 1, 2])), equal_to(True))

    def test_tolist(self) -> None:
        """``tolist`` exposes the entries as a list."""
        assert_that(self.index.tolist(), equal_to([0, 1, 2]))

    def test_equals(self) -> None:
        """``equals`` is true only when values, name and columns all match."""
        test = PandasIndex(PIndex([0, 1, 2], name="index"), ["a", "b"])
        assert_that(self.index.equals(test), equal_to(True))

        # Different length.
        test = PandasIndex(PIndex([0, 1], name="index"), ["a", "b"])
        assert_that(self.index.equals(test), equal_to(False))

        # Different name.
        test = PandasIndex(PIndex([0, 1, 2], name="index2"), ["a", "b"])
        assert_that(self.index.equals(test), equal_to(False))

        # Different columns.
        test = PandasIndex(PIndex([0, 1, 2], name="index"), ["a"])
        assert_that(self.index.equals(test), equal_to(False))

    def test_eq(self) -> None:
        """``==`` against a scalar is element-wise."""
        expected = np.array([False, True, False])
        actual = self.index == 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_ne(self) -> None:
        """``!=`` against a scalar is element-wise."""
        expected = np.array([True, False, True])
        actual = self.index != 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_gt(self) -> None:
        """``>`` against a scalar is element-wise."""
        expected = np.array([False, False, True])
        actual = self.index > 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_ge(self) -> None:
        """``>=`` against a scalar is element-wise."""
        expected = np.array([False, True, True])
        actual = self.index >= 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_lt(self) -> None:
        """``<`` against a scalar is element-wise."""
        expected = np.array([True, False, False])
        actual = self.index < 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_le(self) -> None:
        """``<=`` against a scalar is element-wise."""
        expected = np.array([True, True, False])
        actual = self.index <= 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    # The three tests below were annotated ``-> int`` / ``-> str`` although
    # they return nothing; they are now ``-> None``.

    def test_len(self) -> None:
        """``len`` reports the number of entries."""
        assert_that(len(self.index), equal_to(3))

    def test_str(self) -> None:
        """``str`` must match the literal ``Int64Index(...)`` text below."""
        assert_that(str(self.index),
                    equal_to("Int64Index([0, 1, 2], dtype='int64', name='index')"))

    def test_repr(self) -> None:
        """``repr`` must match the literal ``Int64Index(...)`` text below."""
        assert_that(repr(self.index),
                    equal_to("Int64Index([0, 1, 2], dtype='int64', name='index')"))