Example #1
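The module-level imports are not part of this excerpt. Below is a sketch of the standard-library and pandas/numpy imports it appears to rely on; tanuki-internal names (PandasBackend, DataBackend, Column, ColumnAlias, Index, IndexAlias, Query, DataToken, StorableTypeFactory, the dtype classes, and the T/M/B/D type variables) come from elsewhere in the package and are omitted here.

# Assumed imports for this excerpt -- the original module's import block is not shown.
from __future__ import annotations  # the class body references DataStore before it exists

from io import UnsupportedOperation  # assumption; may be a tanuki-specific exception instead
from typing import Any, ClassVar, Generator, Generic, Iterable, Optional, Type, Union, cast

import numpy as np
from pandas import DataFrame, Series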
class DataStore:
    # Class vars
    _registered_stores: ClassVar[dict[str, dict[int, Type[T]]]] = {}

    @classmethod
    def store_type(cls, store_class: str,
                   store_version: int) -> Optional[Type[DataStore]]:
        return cls._registered_stores.get(store_class, {}).get(store_version)

    type_factory: ClassVar[StorableTypeFactory]
    version: ClassVar[int]
    metadata: ClassVar[Type[M]]
    columns: ClassVar[list[ColumnAlias]]
    indices: ClassVar[list[IndexAlias]]

    # Instance vars
    columns: list[ColumnAlias]
    _data_backend: B
    loc: DataStore._LocIndexer[T]
    iloc: DataStore._ILocIndexer[T]
    index: Index
    metadata: Optional[M]

    def __init_subclass__(cls: Type[T],
                          version: int = 1,
                          register: bool = True) -> None:
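        # Build column/index aliases from the subclass's annotations and register
        # the class under (class name, version).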
        super(DataStore, cls).__init_subclass__()
        if register and DataStore.store_type(cls.__name__,
                                             version) is not None:
            raise TypeError(
                f"Duplicate DataStore version found for {cls.__name__} version {version}"
            )
        cls.type_factory = StorableTypeFactory(list(cls.__mro__),
                                               cls.__annotations__)
        cls.version = version
        cls.metadata = cls.type_factory.metadata
        cls.columns = []
        cls.indices = []
        for name, col in cls._parse_columns().items():
            cls.columns.append(col)
            setattr(cls, name, col)
        for name, index in cls._parse_indices().items():
            cls.indices.append(index)
            setattr(cls, name, index)
        if register:
            if cls.__name__ not in DataStore._registered_stores:
                DataStore._registered_stores[cls.__name__] = {}
            DataStore._registered_stores[cls.__name__][version] = cls

    def __init__(
        self: T,
        metadata: Optional[M] = None,
        index: Optional[Iterable] = None,
        **column_data: list,
    ) -> None:
        self.metadata = metadata
        if len(column_data) > 0:
            column_data = {str(name): col for name, col in column_data.items()}
            self._data_backend = PandasBackend(column_data, index=index)
            self._validate_data_frame()
        else:
            self._data_backend = PandasBackend(index=index)
        self._compile()

    def _compile(self: T) -> None:
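        # Refresh column/index attributes and the loc/iloc indexers from the
        # current backend contents.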
        self._attach_columns()
        self._attach_indices()
        self._all_columns = self._parse_columns()
        self._active_columns = self._parse_active_columns()
        self.columns = list(self._active_columns.values())
        self.index = self._data_backend.index
        self.loc = DataStore._LocIndexer[T](self)
        self.iloc = DataStore._ILocIndexer[T](self)

    @classmethod
    def link(cls: Type[T],
             database: D,
             data_token: DataToken,
             read_only: bool = True) -> T:
        from tanuki.data_backend.database_backend import DatabaseBackend

        return cls.from_backend(
            DatabaseBackend[T](cls, database, data_token, read_only=read_only),
            validate=False,
        )

    @classmethod
    def from_rows(
        cls: Type[T],
        data_rows: list[tuple],
        columns: Optional[list[str]] = None,
        metadata: Optional[M] = None,
    ) -> T:
        if columns is None:
            columns = list(cls._parse_columns().keys())
        else:
            columns = [str(col) for col in columns]
        data = DataFrame.from_records(data_rows, columns=columns)
        return cls.from_backend(PandasBackend(data), metadata)

    @classmethod
    def from_pandas(cls: Type[T],
                    data: Union[Series, DataFrame],
                    metadata: Optional[M] = None) -> T:
        return cls.from_backend(PandasBackend(data), metadata)

    def to_pandas(self: T) -> DataFrame:
        return self._data_backend.to_pandas()

    def to_dict(self: T) -> dict:
        return self._data_backend.to_dict()

    @property
    def values(self: T) -> np.ndarray:
        return self._data_backend.values

    @property
    def dtypes(self: T) -> dict[str, DataType]:
        return {col.name: col.dtype for col in self.columns}

    def is_link(self: T) -> bool:
        return self._data_backend.is_link()

    def link_token(self: T) -> Optional[DataToken]:
        return self._data_backend.link_token()

    def load(self: T) -> T:
        return self.from_backend(self._data_backend.load())

    @classmethod
    def from_backend(
        cls: Type[T],
        data_backend: B,
        metadata: Optional[M] = None,
        validate: bool = True,
    ) -> T:
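        # Wrap an existing data backend in a new store instance, optionally
        # validating/casting its columns against the declared dtypes.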
        instance = cls()
        instance.metadata = metadata
        instance._data_backend = data_backend
        if validate:
            instance._validate_data_frame()
        instance._compile()
        return instance

    @classmethod
    def _parse_columns(cls) -> dict[str, ColumnAlias]:
        return cls.type_factory.columns

    @classmethod
    def _parse_indices(cls) -> dict[str, IndexAlias]:
        return cls.type_factory.indices

    def _parse_active_columns(self: T) -> dict[str, ColumnAlias]:
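        # Map the backend's columns onto the declared aliases, rejecting any
        # column the class does not know about.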
        columns = self._parse_columns()
        backend_columns = [str(col) for col in self._data_backend.columns]
        unmatched_columns = set(backend_columns) - columns.keys()
        if len(unmatched_columns) > 0:
            raise KeyError(
                f"Data backend contains columns which are not supported by {self.__class__.__name__}:\n{unmatched_columns}"
            )

        active_columns = {}
        for col in backend_columns:
            active_columns[col] = columns[col]
        return active_columns

    def _validate_data_frame(self: T) -> None:
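        # Cast backend columns whose inferred dtype differs from the declared
        # one, collecting the names of columns that cannot be cast.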
        columns = self._parse_active_columns()

        invalid_types = []
        for name, col in columns.items():
            if isinstance(col.dtype, TypeAlias):
                continue
            col_data = self._data_backend[name]
            data_dtype = Column.infer_dtype(name, col_data)
            # TODO: Run in batch
            if data_dtype is not type(None) and data_dtype != col.dtype:
                try:
                    cast_column = col(name, col_data)
                    self._data_backend[name] = cast_column._data_backend
                except Exception:
                    invalid_types.append(name)

        if len(invalid_types) != 0:
            raise TypeError(f"Invalid types provided for: {invalid_types}")

    def _attach_columns(self: T) -> None:
        columns = self._parse_columns()
        active_columns = self._parse_active_columns()
        for name, col in columns.items():
            if name in active_columns:
                data = self._data_backend[name]
                setattr(self, name, col(name, data))
            else:
                setattr(self, name, None)

    def _attach_indices(self: T) -> None:
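        # Expose each declared index as an attribute, or None when any of its
        # columns is missing from the backend.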
        indices = self._parse_indices()
        active_columns = self._parse_active_columns()
        for name, alias in indices.items():
            col_names = [col.name for col in alias.columns]
            has_columns = True
            for col in col_names:
                if col not in active_columns:
                    has_columns = False
                    break
            if not has_columns:
                setattr(self, name, None)
            else:
                index = self._data_backend.get_index(alias)
                setattr(self, name, index)

    def __contains__(self: T, key) -> bool:
        return str(key) in self._all_columns

    def __str__(self: T) -> str:
        if len(self._data_backend) == 0:
            return f"Empty {self.__class__.__name__}"
        else:
            return f"{self.__class__.__name__}\n{self._data_backend}"

    def __repr__(self: T) -> str:
        return str(self)

    def equals(self: T, other) -> bool:
        if not issubclass(type(other), DataStore):
            return False
        oc = cast(DataStore, other)
        return (other.__class__.__name__ == self.__class__.__name__
                and self._data_backend.equals(oc._data_backend)
                and self.metadata == oc.metadata)

    def _get_external_backend(self: T, other: Any) -> Any:
        if issubclass(type(other), DataStore):
            if not self.__class__.__name__ == other.__class__.__name__:
                raise UnsupportedOperation(
                    "Cannot compare different DataStore types: " +
                    f"{self.__class__.__name__} vs {other.__class__.__name__}")
            return cast(DataStore, other)._data_backend
        else:
            return other

    def __eq__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend == other

    def __ne__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend != other

    def __gt__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend > other

    def __ge__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend >= other

    def __lt__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend < other

    def __le__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend <= other

    def __len__(self: T) -> int:
        return len(self._data_backend)

    def __iter__(self: T) -> Generator[ColumnAlias, None, None]:
        for column in self._data_backend:
            yield self._active_columns[column]

    def iterrows(self: T) -> Generator[tuple[int, T], None, None]:
        for i, row in self._data_backend.iterrows():
            yield (i, self.from_backend(row, self.metadata))

    def itertuples(self: T,
                   ignore_index: bool = False) -> Generator[tuple, None, None]:
        return self._data_backend.itertuples(ignore_index=ignore_index)

    def _get_column(self: T, item: str) -> Optional[Column]:
        if item not in self._all_columns:
            raise ValueError(
                f"Could not match '{item}' to {self.__class__.__name__} column"
            )
        elif item not in self._active_columns:
            return None
        else:
            return self._all_columns[item](item, self._data_backend[item])

    def _get_columns(self: T, columns: list[str]) -> B:
        unused_columns = set(columns) - self._all_columns.keys()
        if len(unused_columns) > 0:
            raise ValueError(
                f"The following columns do not exist in {self.__class__.__name__}: {unused_columns}"
            )
        return self._data_backend.getitems(columns)

    def _get_mask(self: T, mask: list[bool]) -> B:
        return self._data_backend.getmask(mask)

    def query(self: T, query: Optional[Query] = None) -> T:
        return self.from_backend(self._data_backend.query(query),
                                 self.metadata)

    def __getitem__(
        self: T, item: Union[ColumnAlias, list[ColumnAlias], list[bool], Query]
    ) -> Union[Column, T]:
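        # Dispatch on the key type: a Query filters rows, the literal "index" returns
        # the backend index, a column name returns a Column, a list of names selects a
        # sub-store, and a boolean list masks rows.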
        if issubclass(type(item), Query):
            result = self._data_backend.query(item)
        elif item == "index":
            return self._data_backend.index
        elif type(item) is str or type(item) is ColumnAlias:
            result = self._get_column(str(item))
        elif isinstance(item, Iterable):
            sample = item
            while type(sample) is not str and isinstance(sample, Iterable):
                sample = next(iter(sample))
            value_type = DataType(type(sample))
            if value_type is String or value_type is ColumnAlias:
                result = self._get_columns([str(value) for value in item])
            elif value_type is Boolean:
                result = self._get_mask(item)
            else:
                raise RuntimeError(f"Unknown get item request: {item}")
        else:
            raise RuntimeError(f"Unknown get item request: {item}")

        if issubclass(type(result), DataBackend):
            result = self.from_backend(result, self.metadata)
        return result

    def __getattr__(self: T, name: str) -> Any:
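        # Unknown public attribute lookups are reported as missing columns; names
        # starting with "_" fall through and resolve to None.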
        if name[0] != "_":
            raise AttributeError(
                f"Could not match '{name}' to {self.__class__.__name__} column"
            )

    def set_index(self: T, index: Union[Index, IndexAlias]) -> T:
        return self.from_backend(self._data_backend.set_index(index),
                                 self.metadata)

    def reset_index(self: T) -> T:
        return self.from_backend(self._data_backend.reset_index(),
                                 self.metadata)

    def append(self: T, new_store: T, ignore_index: bool = False) -> T:
        return self.from_backend(
            self._data_backend.append(new_store._data_backend,
                                      ignore_index=ignore_index),
            self.metadata,
        )

    def drop(self: T, indices: list[int]) -> T:
        return self.from_backend(self._data_backend.drop_indices(indices),
                                 self.metadata)

    @classmethod
    def concat(cls: Type[T],
               all_data_stores: list[T],
               ignore_index: bool = False) -> T:
        all_match = all([isinstance(item, cls) for item in all_data_stores])
        if not all_match:
            raise ValueError("All data stores must be same type for concat")

        backend_sample: B = all_data_stores[0]._data_backend
        all_backends = [store._data_backend for store in all_data_stores]
        return cls.from_backend(
            backend_sample.concat(all_backends, ignore_index=ignore_index),
            all_data_stores[0].metadata,
        )

    @classmethod
    def builder(cls: Type[T]) -> DataStore._Builder[T]:
        return DataStore._Builder[cls](cls)

    class _Builder(Generic[T]):
        _store_class: Type[T]
        _column_data: dict[str, Column]
        _row_data: dict[str, list]

        def __init__(self, store_class: Type[T]) -> None:
            self._store_class = store_class
            self._column_data = {}
            self._row_data = {}

        def append_column(self, column_name: str,
                          column_data: Column) -> DataStore._Builder[T]:
            if len(self._row_data) > 0:
                raise UnsupportedOperation(
                    "Cannot insert column when row data present")
            self._column_data[str(column_name)] = column_data
            return self

        def __setitem__(self, column_name: str, column_data) -> None:
            self._column_data[str(column_name)] = column_data

        def append_row(self, **row_data: Any) -> DataStore._Builder[T]:
            if len(self._column_data) > 0:
                raise UnsupportedOperation(
                    "Cannot insert row data when column data present")
            for key, value in row_data.items():
                if key not in self._row_data:
                    self._row_data[key] = []
                self._row_data[key].append(value)
            return self

        def build(self, metadata: Optional[M] = None) -> T:
            if len(self._column_data) > 0:
                return self._store_class(**self._column_data,
                                         metadata=metadata)
            else:
                return self._store_class(**self._row_data, metadata=metadata)

    class _ILocIndexer(Generic[T]):
        _data_store: T

        def __init__(self, data_store: T) -> None:
            self._data_store = data_store

        def __getitem__(self, item: Union[int, list, slice]) -> T:
            return self._data_store.from_backend(
                self._data_store._data_backend.iloc[item],
                self._data_store.metadata)

    class _LocIndexer(Generic[T]):
        _data_store: T

        def __init__(self, data_store: T) -> None:
            self._data_store = data_store

        def __getitem__(self, item: Union[Any, list, slice]) -> T:
            return self._data_store.from_backend(
                self._data_store._data_backend.loc[item],
                self._data_store.metadata)
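A short usage sketch, based on the ExampleStore fixture exercised in Example #2 (its class definition is not part of this listing):

# Sketch only: ExampleStore is the fixture used by the tests in Example #2; its
# column/index declarations live elsewhere in the tanuki test suite.
store = ExampleStore(a=["a", "b", "c"], b=[1, 2, 3], c=[True, False, True])
subset = store[(ExampleStore.a == "a") | (ExampleStore.b == 3)]  # Query-based selection
first_row = store.iloc[0]                                        # positional indexing
by_a = store.set_index(ExampleStore.a_index)                     # switch to a declared index
built = ExampleStore.builder().append_row(a="d", b=4, c=False).build()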
Example #2
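As with Example #1, the test module's imports are omitted; it appears to use PyHamcrest assertions alongside pandas/numpy, plus tanuki's PandasBackend, PandasIndex, the dtype singletons (Object, Int64, Boolean), and an ExampleStore fixture defined elsewhere in the test suite. A sketch of the non-tanuki imports:

# Assumed non-tanuki imports for this test excerpt.
import numpy as np
from hamcrest import assert_that, equal_to, is_, is_in
from pandas import DataFrame, Series
from pandas import Index as PIndex  # the tests refer to pandas' Index as PIndex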
class TestPandasBackend:

    def setup_method(self) -> None:
        self.data_backend = PandasBackend(
            DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]})
        )
        self.test_series0 = PandasBackend(Series({"a": "a", "b": 1, "c": True}), index=[0])
        self.test_series2 = PandasBackend(Series({"a": "c", "b": 3, "c": True}), index=[2])

    def test_iloc(self) -> None:
        actual_series = self.data_backend.iloc[0]
        assert_that(actual_series.equals(self.test_series0), is_(True))

    def test_loc(self) -> None:
        test_slice = self.data_backend.iloc[[0, 2]]
        actual_series = test_slice.loc[2]
        assert_that(actual_series.equals(self.test_series2), is_(True))

    def test_to_dict(self) -> None:
        frame_expected_dict = {
            "a": ["a", "b", "c"],
            "b": [1, 2, 3],
            "c": [True, False, True],
        }
        assert_that(self.data_backend.to_dict(), equal_to(frame_expected_dict))
        series_expected_dict = {"a": ["a"], "b": [1], "c": [True]}
        assert_that(self.test_series0.to_dict(), equal_to(series_expected_dict))

    def test_single_row(self) -> None:
        assert_that(self.test_series0["a"].values[0], equal_to("a"))
        assert_that(self.test_series0["b"].values[0], equal_to(1))
        assert_that(self.test_series0["c"].values[0], equal_to(True))

        example_row = self.data_backend.iloc[0]
        assert_that(example_row["a"].values[0], equal_to("a"))
        assert_that(example_row["b"].values[0], equal_to(1))
        assert_that(example_row["c"].values[0], equal_to(True))

    def test_set_index(self) -> None:
        test_slice = self.data_backend.iloc[[0, 2]]
        assert_that(test_slice.index.tolist(), equal_to([0, 2]))
        test_slice = test_slice.set_index(ExampleStore.a_index)
        assert_that(test_slice.index.tolist(), equal_to(["a", "c"]))

    def test_get_index(self) -> None:
        assert_that(self.data_backend.index.tolist(), equal_to([0, 1, 2]))
        test_slice = self.data_backend.iloc[[0, 2]]
        assert_that(test_slice.index.tolist(), equal_to([0, 2]))

    def test_reset_index(self) -> None:
        test_slice = self.data_backend.iloc[[0, 2]]
        assert_that(test_slice.index.tolist(), equal_to([0, 2]))
        test_slice = test_slice.reset_index()
        assert_that(test_slice.index.tolist(), equal_to([0, 1]))

    def test_contains(self) -> None:
        assert_that("a", is_in(self.data_backend))

    def test_len(self) -> None:
        assert_that(len(self.data_backend), equal_to(3))

    def test_columns(self) -> None:
        assert_that(self.data_backend.columns, equal_to(["a", "b", "c"]))
        assert_that(self.test_series0.columns, equal_to(["a", "b", "c"]))

    def test_iter(self) -> None:
        columns = ["a", "b", "c"]
        for actual_col, expected_col in zip(self.data_backend, columns):
            assert_that(actual_col, equal_to(expected_col))

    def test_iterrows(self) -> None:
        for i, row in self.data_backend.iterrows():
            iloc_row = self.data_backend.iloc[i]
            assert_that(row.equals(iloc_row), is_(True))

    def test_itertuples(self) -> None:
        for i, a, b, c in self.data_backend.itertuples():
            iloc_row = self.data_backend.iloc[i]
            assert_that(a, equal_to(iloc_row["a"].values[0]))
            assert_that(b, equal_to(iloc_row["b"].values[0]))
            assert_that(c, equal_to(iloc_row["c"].values[0]))

    def test_str(self) -> None:
        expected = "       a  b      c\nindex             \n0      a  1   True\n1      b  2  False\n2      c  3   True"
        assert_that(str(self.data_backend), equal_to(expected))

    def test_repr(self) -> None:
        expected = "       a  b      c\nindex             \n0      a  1   True\n1      b  2  False\n2      c  3   True"
        assert_that(repr(self.data_backend), equal_to(expected))

    def test_is_link(self) -> None:
        assert_that(self.data_backend.is_link(), equal_to(False))

    def test_link_token(self) -> None:
        assert_that(self.data_backend.link_token(), equal_to(None))

    def test_to_pandas(self) -> None:
        expected = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.to_pandas().equals(expected), equal_to(True))

    def test_values(self) -> None:
        expected = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(
            np.array_equal(self.data_backend.values, expected.values), equal_to(True)
        )

    def test_dtypes(self) -> None:
        expected = {"a": Object, "b": Int64, "c": Boolean}
        assert_that(self.data_backend.dtypes, equal_to(expected))

    def test_cast_columns(self) -> None:
        expected = DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [1, 0, 1]})
        actual = self.data_backend.cast_columns({"c": int})
        assert_that(np.array_equal(actual.values, expected.values), equal_to(True))

    def test_index(self) -> None:
        expected = PandasIndex(PIndex([0, 1, 2], name="index"), [])
        assert_that(self.data_backend.index.equals(expected), equal_to(True))

        new_frame = self.data_backend.set_index(ExampleStore.ab_index)
        pindex = PIndex([("a", 1), ("b", 2), ("c", 3)])
        pindex.name = "ab_index"
        expected = PandasIndex(pindex, ["a", "b"])
        assert_that(new_frame.index.equals(expected), equal_to(True))

    def test_index_name(self) -> None:
        assert_that(self.data_backend.index_name, equal_to("index"))
        new_frame = self.data_backend.set_index(ExampleStore.ab_index)
        assert_that(new_frame.index_name, equal_to("ab_index"))

    def test_equals(self) -> None:
        test = PandasBackend(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(test), equal_to(True))

        test = PandasBackend(
            {"a": ["a", "b", "d"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(test), equal_to(False))

    def test_eq(self) -> None:
        test = PandasBackend(
            {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
        )
        expected = DataFrame(
            {
                "a": [False, True, True],
                "b": [True, False, True],
                "c": [True, True, False],
            }
        )
        assert_that((self.data_backend == test).equals(expected), equal_to(True))

    def test_ne(self) -> None:
        test = PandasBackend(
            {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
        )
        expected = DataFrame(
            {
                "a": [True, False, False],
                "b": [False, True, False],
                "c": [False, False, True],
            }
        )
        assert_that((self.data_backend != test).equals(expected), equal_to(True))

    def test_gt(self) -> None:
        sample = ExampleStore(b=[1, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [False, True, False]})
        assert_that((sample > test).equals(expected), equal_to(True))

    def test_ge(self) -> None:
        sample = ExampleStore(b=[0, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [False, True, True]})
        assert_that((sample >= test).equals(expected), equal_to(True))

    def test_lt(self) -> None:
        sample = ExampleStore(b=[0, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [True, False, False]})
        assert_that((sample < test).equals(expected), equal_to(True))

    def test_le(self) -> None:
        sample = ExampleStore(b=[0, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [True, False, True]})
        assert_that((sample <= test).equals(expected), equal_to(True))

    def test_getitem(self) -> None:
        expected = PandasBackend({"b": [1, 2, 3]})
        assert_that(self.data_backend["b"].equals(expected), equal_to(True))

    def test_getitems(self) -> None:
        expected = PandasBackend({"a": ["a", "b", "c"], "b": [1, 2, 3]})
        assert_that(
            self.data_backend.getitems(["a", "b"]).equals(expected), equal_to(True)
        )

    def test_getmask(self) -> None:
        test = self.data_backend.getmask([True, False, True])
        expected = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=PandasIndex(PIndex([0, 2], name="index"), []),
        )
        repr(expected)
        assert_that(test.equals(expected), equal_to(True))

    def test_query(self) -> None:
        query = (ExampleStore.a == "a") | (ExampleStore.b == 3)
        test = self.data_backend.query(query)
        expected = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=PandasIndex(PIndex([0, 2], name="index"), []),
        )
        assert_that(test.equals(expected), equal_to(True))

    def test_setitem(self) -> None:
        self.data_backend["a"] = ["d", "e", "f"]
        expected = PandasBackend(
            {"a": ["d", "e", "f"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(expected), equal_to(True))

    def test_append(self) -> None:
        postfix = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
        new_frame = self.data_backend.append(postfix, ignore_index=True)
        expected = PandasBackend(
            {
                "a": ["a", "b", "c", "d"],
                "b": [1, 2, 3, 4],
                "c": [True, False, True, False],
            }
        )
        assert_that(new_frame.equals(expected), equal_to(True))

    def test_drop_indices(self) -> None:
        new_frame = self.data_backend.drop_indices([1])
        expected = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=PandasIndex(PIndex([0, 2], name="index"), []),
        )
        assert_that(new_frame.equals(expected), equal_to(True))

    def test_concat(self) -> None:
        postfix = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
        new_frame = PandasBackend.concat([self.data_backend, postfix], ignore_index=True)
        expected = PandasBackend(
            {
                "a": ["a", "b", "c", "d"],
                "b": [1, 2, 3, 4],
                "c": [True, False, True, False],
            }
        )
        assert_that(new_frame.equals(expected), equal_to(True))
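For reference, a plausible shape for the ExampleStore fixture these tests rely on, inferred from the columns (a, b, c), their dtypes (String/Object, Int64, Boolean), and the indices (a_index, ab_index) they exercise; the annotation syntax below is a guess, and the real definition in the tanuki test suite may differ.

# Hypothetical sketch of the ExampleStore fixture -- not taken from the tanuki
# sources; the column/index annotation syntax is assumed for illustration.
class ExampleStore(DataStore, version=1):
    a: Column[String]
    b: Column[Int64]
    c: Column[Boolean]

    a_index: Index[a]
    ab_index: Index[a, b]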