def test_get_with_null_parameters(self):
    """
    `get` with neither a column nor a split selected should match the
    `expected_dataframe` fixture (all columns and rows except the split
    column).
    """
    unfiltered = self.dummy_dataset.get(column=None, split=None)
    assert_data_container_equal(self.expected_dataframe, unfiltered)
def test_dataframe(self):
    """
    The dataset's `_dataframe` attribute should equal the raw `_data`
    fixture, i.e. a copy of the full dataset.
    """
    full_frame = self.dummy_dataset._dataframe
    assert_data_container_equal(self._data, full_frame)
def test_save_and_load_parquet(self):
    """
    Round-trip a dataset generated with the "dask_disk_parquet" save pattern.

    The computed dataframe after saving and reloading the external files
    must equal the one computed beforehand.
    """
    dataset = self.generate_dataset("dask_disk_parquet")
    before_roundtrip = dataset.dataframe.compute()

    dataset.save_external_files()
    dataset.load_external_files()

    after_roundtrip = dataset.dataframe.compute()
    assert_data_container_equal(before_roundtrip, after_roundtrip)
def test_x(self):
    """
    The `X` property should agree with an explicit `get(column="X")` call;
    both are compared against the `expected_x` fixture.
    """
    dataset = self.dummy_dataset

    # Explicit accessor
    x_via_get = dataset.get(column="X", split=None)
    assert_data_container_equal(self.expected_x, x_via_get)

    # Property wrapper
    assert_data_container_equal(self.expected_x, dataset.X)
def test_y(self):
    """
    The `y` property should agree with an explicit `get(column="y")` call;
    both are compared against the `expected_y` fixture.
    """
    dataset = self.dummy_dataset

    # Explicit accessor
    y_via_get = dataset.get(column="y", split=None)
    assert_data_container_equal(self.expected_y, y_via_get)

    # Property wrapper
    assert_data_container_equal(self.expected_y, dataset.y)
def test_save_and_load_csv(self):
    """
    Round-trip a dataset generated with the "dask_disk_csv" save pattern.

    The reference frame has its index renamed to
    `DaskPersistenceMethods.INDEX_COLUMN` before comparing, because the
    reloaded frame is expected to carry a custom index.
    """
    dataset = self.generate_dataset("dask_disk_csv")
    reference = dataset.dataframe.compute()
    # expect a custom index
    reference.index.name = DaskPersistenceMethods.INDEX_COLUMN

    dataset.save_external_files()
    dataset.load_external_files()

    reloaded = dataset.dataframe.compute()
    assert_data_container_equal(reference, reloaded)
def test_get_X_mutability(self):
    """
    Verify that frames returned by `dataset.get` cannot mutate the dataset.

    Pandas dataframes often return copies and views for efficiency.
    Views can cause inplace mutations to propagate back to the original
    dataframe. That is not allowed to maintain the integrity of the persisted
    data

    Tests for:
    - memory pointers (object id)
    - df._is_copy is not None (weakref when attached to a parent df);
      the test asserts it IS None
    - df._is_view is False (True for certain slices)
    """
    dataset = self.dummy_dataset
    # Snapshot of the dataframe taken before any mutation attempt; every
    # subtest compares the dataset against it afterwards.
    if issubclass(self.dataset_cls, BaseDaskDataset):
        # dask -- copied without the `deep` argument
        unmodified_copy = dataset.dataframe.copy()
    else:
        unmodified_copy = dataset.dataframe.copy(deep=True)

    # Only the "X" column is exercised, across every split plus no split.
    for column, split in itertools.product(
            ["X"], ["TRAIN", "TEST", "VALIDATION", None]):
        with self.subTest(column=column, split=split):
            copy = dataset.get(column=column, split=split)

            try:
                # Test for pandas references
                self.assertIsNone(copy._is_copy)

                # Not fully understood behavior causes pandas to return views for certain
                # operations that morph into copies when modified (appears subject to mem optimizations)
                self.assertFalse(copy._is_view)

                # Modify copy and see if the source changed
                # dask doesnt support item assignment
                copy.loc[1, "a"] = 9876
                # After the edit, a fresh `get` must no longer equal the
                # mutated copy.
                with self.assertRaises(AssertionError):
                    assert_data_container_equal(
                        copy, dataset.get(column=column, split=split))
            except AttributeError:
                # NOTE(review): presumably raised by dask containers that lack
                # these pandas-only attributes -- confirm. Any AttributeError
                # skips the remaining pandas-specific checks above.
                pass

            # The dataset's dataframe must still match the snapshot.
            assert_data_container_equal(dataset.dataframe, unmodified_copy)

            # id pointer
            # None of the handed-out objects may be the same object as the
            # stored `_external_file`, nor as each other.
            self.assertNotEqual(id(dataset._external_file), id(copy))
            self.assertNotEqual(id(dataset._external_file), id(dataset.dataframe))
            self.assertNotEqual(id(dataset._external_file), id(unmodified_copy))
            self.assertNotEqual(id(dataset.dataframe), id(copy))
            self.assertNotEqual(id(unmodified_copy), id(copy))
def test_dataframe_mutability(self):
    """
    Dropping the split column from a frame obtained via `_dataframe` must
    not affect the dataset's raw data: a subsequent `_dataframe` read has to
    differ from the mutated frame.
    """
    dataset = self.dummy_dataset
    mutated = dataset._dataframe

    try:
        mutated.drop(DATAFRAME_SPLIT_COLUMN, axis=1, inplace=True)
    except TypeError:
        # dask -- fall back to the non-inplace drop and rebind
        mutated = mutated.drop(DATAFRAME_SPLIT_COLUMN, axis=1)

    # Equality is expected to FAIL, proving the source was left untouched.
    with self.assertRaises(AssertionError):
        assert_data_container_equal(dataset._dataframe, mutated)
def test_get_with_split(self):
    """
    `get` restricted to the "TRAIN" split should return the matching slices
    for X, y and the whole frame.
    """
    dataset = self.dummy_dataset

    # Fetch all three slices first, then compare each against its fixture.
    train_x = dataset.get(column="X", split="TRAIN")
    train_y = dataset.get(column="y", split="TRAIN")
    train_frame = dataset.get(column=None, split="TRAIN")

    assert_data_container_equal(self.expected_train_x, train_x)
    assert_data_container_equal(self.expected_train_y, train_y)
    assert_data_container_equal(self.expected_train_dataframe, train_frame)
def compare_datasets(self, new, old):
    """
    Assert that two datasets match: first via `compare_hashes`, then by
    comparing their `_external_file` containers.
    """
    self.compare_hashes(new, old)

    new_file, old_file = new._external_file, old._external_file
    assert_data_container_equal(new_file, old_file)
def test_get_nonexistent_split(self):
    """
    Requesting a split that does not exist should return empty containers,
    compared here against zero-row heads of the expected fixtures.
    """
    dataset = self.dummy_dataset
    X = dataset.get(column="X", split="NONSENSE")
    y = dataset.get(column="y", split="NONSENSE")
    data = dataset.get(column=None, split="NONSENSE")

    if isinstance(data, ddDataFrame):
        # head calls return a pd.DataFrame/series, so the dask results are
        # computed before comparing
        def materialize(container):
            return container.compute()
    else:
        def materialize(container):
            return container

    assert_data_container_equal(materialize(X), self.expected_x.head(0))
    assert_data_container_equal(materialize(y), self.expected_y.head(0))
    assert_data_container_equal(materialize(data), self.expected_dataframe.head(0))