def save(self, dataset_id: str, dataset: Dataset) -> DatasetSource:
    writer = dataset.get_writer()
    if writer is None:
        raise ValueError(
            f'{dataset.dataset_type} does not support artifact persistence'
        )
    reader, artifacts = writer.write(dataset)
    with artifacts.blob_dict() as blobs:
        try:
            pushed = self.repo.push_artifact(
                self.ARTIFACT_TYPE, dataset_id, blobs
            )
        except ArtifactExistsError as e:
            raise DatasetExistsError(dataset_id, self, e) from e
    return ArtifactDatasetSource(reader, pushed, dataset.dataset_type)
def dataset_write_read_check(
    dataset: Dataset,
    writer: Optional[DatasetWriter] = None,
    reader_type: Optional[Type[DatasetReader]] = None,
    custom_eq: Optional[Callable[[Any, Any], bool]] = None,
    custom_assert: Optional[Callable[[Any, Any], Any]] = None,
):
    writer = writer or dataset.get_writer()
    reader, artifacts = writer.write(dataset)
    if reader_type is not None:
        assert isinstance(reader, reader_type)
    new = reader.read(artifacts)
    assert dataset.dataset_type == new.dataset_type
    if custom_assert is not None:
        custom_assert(new.data, dataset.data)
    elif custom_eq is not None:
        assert custom_eq(new.data, dataset.data)
    else:
        assert new.data == dataset.data
def data() -> Dataset:
    return Dataset('abcdefg', TestDatasetType())
def read(self, artifacts: ArtifactCollection) -> Dataset:
    return Dataset(
        artifacts.bytes_dict()['data'].decode('utf8'),
        TestDatasetType(),
    )
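# This reader implies a writer that stored the string payload under the
# 'data' key. A minimal sketch of that counterpart, assuming write()
# returns a (reader, artifacts) pair; TestDatasetWriter, TestDatasetReader,
# and InMemoryArtifactCollection are illustrative names, not classes
# confirmed by the source:
class TestDatasetWriter(DatasetWriter):
    def write(
        self, dataset: Dataset
    ) -> Tuple[DatasetReader, ArtifactCollection]:
        # Mirror of read() above: encode the string under the 'data' key
        artifacts = InMemoryArtifactCollection(
            {'data': dataset.data.encode('utf8')}
        )
        return TestDatasetReader(), artifacts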
def test_with_index_complex(data, format):
    writer = PandasWriter(format)
    dataset_write_read_check(
        Dataset.from_object(data),
        writer,
        PandasReader,
        custom_assert=pandas_assert,
    )
def test_with_multiindex(data, format):
    writer = PandasWriter(format)
    dataset_write_read_check(
        Dataset.from_object(data.set_index(['a', 'b'])),
        writer,
        PandasReader,
        custom_assert=pandas_assert,
    )
def test_simple_df(data, format):
    writer = PandasWriter(format)
    dataset_write_read_check(
        Dataset.from_object(data),
        writer,
        PandasReader,
        custom_eq=pd.DataFrame.equals,
    )
def read(self, artifacts: ArtifactCollection) -> Dataset:
    payload = artifacts.bytes_dict()[OneFileDatasetWriter.FILENAME]
    return Dataset(self.convert(payload), self.dataset_type)
def test_ndarray_source():
    data = np.array([1, 2, 3])
    dataset = Dataset.from_object(data)
    dataset_write_read_check(dataset, custom_eq=np.array_equal)
def read(self, artifacts: ArtifactCollection) -> Dataset:
    with artifacts.blob_dict() as blobs:
        with blobs[DATA_FILE].bytestream() as f:
            # np.load on an .npz archive returns a mapping of arrays
            data = np.load(f)[DATA_KEY]
    return Dataset.from_object(data)
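# Indexing the result of np.load with DATA_KEY means the artifact must be
# an .npz archive, i.e. produced with np.savez rather than np.save. A
# sketch of that write side under the same assumptions as above (the
# writer class name and InMemoryArtifactCollection are illustrative):
class NumpyArrayWriter(DatasetWriter):
    def write(
        self, dataset: Dataset
    ) -> Tuple[DatasetReader, ArtifactCollection]:
        buffer = io.BytesIO()
        # savez stores the array under DATA_KEY inside the .npz archive,
        # matching the np.load(f)[DATA_KEY] lookup in the reader
        np.savez(buffer, **{DATA_KEY: dataset.data})
        artifacts = InMemoryArtifactCollection({DATA_FILE: buffer.getvalue()})
        return NumpyArrayReader(), artifacts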
def read(self, artifacts: ArtifactCollection) -> Dataset:
    with artifacts.blob_dict() as blobs, \
            blobs[PANDAS_DATA_FILE].bytestream() as b:
        return Dataset.from_object(
            self.data_type.align(self.format.read(b))
        )
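# The pandas read path delegates parsing to self.format.read and then
# re-aligns the result via data_type.align, so formats that lose index or
# dtype information (e.g. CSV) still round-trip against the recorded data
# type. For symmetry, the write side could look roughly like this;
# format.write and the PandasReader constructor arguments are assumptions,
# not confirmed API:
class PandasWriter(DatasetWriter):
    def __init__(self, format):
        self.format = format

    def write(
        self, dataset: Dataset
    ) -> Tuple[DatasetReader, ArtifactCollection]:
        buffer = io.BytesIO()
        # Assumed mirror of format.read: serialize the DataFrame to bytes
        self.format.write(dataset.data, buffer)
        artifacts = InMemoryArtifactCollection(
            {PANDAS_DATA_FILE: buffer.getvalue()}
        )
        # Constructor arguments assumed from the fields the reader uses
        return PandasReader(dataset.dataset_type, self.format), artifacts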
def create_dataset(self, data, target=None):
    # TODO: persist to the artifact repo?
    return Dataset.from_object(data)
def dataset(pandas_data):
    return Dataset.from_object(pandas_data)