示例#1
0
 def test_get_with_null_parameters(self):
     """
     With neither a column nor a split requested, `get` should return
     all columns and rows except the split column.
     """
     result = self.dummy_dataset.get(column=None, split=None)
     assert_data_container_equal(self.expected_dataframe, result)
示例#2
0
 def test_dataframe(self):
     """
     The dataset's `_dataframe` should return a copy of the full dataset,
     i.e. compare equal to the fixture's `_data`.
     """
     full_frame = self.dummy_dataset._dataframe
     assert_data_container_equal(self._data, full_frame)
示例#3
0
 def test_save_and_load_parquet(self):
     """
     Round trip through the "dask_disk_parquet" save pattern: the frame
     computed after saving and reloading the external files should equal
     the frame computed beforehand.
     """
     dataset = self.generate_dataset("dask_disk_parquet")
     before = dataset.dataframe.compute()

     # Persist, then read back from the external files.
     dataset.save_external_files()
     dataset.load_external_files()

     after = dataset.dataframe.compute()
     assert_data_container_equal(before, after)
示例#4
0
 def test_x(self):
     """
     The `X` property is a wrapper: it should give the same result as
     `get(column="X", split=None)`, and both should match the expected X.
     """
     dataset = self.dummy_dataset

     # Explicit call first ...
     via_get = dataset.get(column="X", split=None)
     assert_data_container_equal(self.expected_x, via_get)

     # ... then the property shortcut.
     assert_data_container_equal(self.expected_x, dataset.X)
示例#5
0
 def test_y(self):
     """
     The `y` property is a wrapper: it should give the same result as
     `get(column="y", split=None)`, and both should match the expected y.
     """
     dataset = self.dummy_dataset

     # Explicit call first ...
     via_get = dataset.get(column="y", split=None)
     assert_data_container_equal(self.expected_y, via_get)

     # ... then the property shortcut.
     assert_data_container_equal(self.expected_y, dataset.y)
示例#6
0
 def test_save_and_load_csv(self):
     """
     Round trip through the "dask_disk_csv" save pattern: the frame
     computed after saving and reloading the external files should equal
     the frame computed beforehand, once the reference frame's index is
     named `DaskPersistenceMethods.INDEX_COLUMN`.
     """
     dataset = self.generate_dataset("dask_disk_csv")
     before = dataset.dataframe.compute()
     # A custom index name is expected on the reloaded frame, so the
     # reference frame is given the same name before saving.
     before.index.name = DaskPersistenceMethods.INDEX_COLUMN

     # Persist, then read back from the external files.
     dataset.save_external_files()
     dataset.load_external_files()

     after = dataset.dataframe.compute()
     assert_data_container_equal(before, after)
示例#7
0
    def test_get_X_mutability(self):
        """
        Pandas dataframes often return copies and views for efficiency.
        Views can cause inplace mutations to propagate back to the original
        dataframe. That is not allowed to maintain the integrity of the persisted
        data

        For the "X" column of every split (and no split), checks that the
        result of `get`:
        - has `_is_copy` set to None (it is a weakref when attached to a
          parent df)
        - has `_is_view` set to False (True for certain slices)
        - can be modified without changing what `get` returns afterwards
        - is a different object than the dataset's own frames (identity)

        An AttributeError raised by the pandas-specific checks is ignored
        on purpose (presumably the dask case, where these attributes are
        missing -- confirm); the remaining checks still run.
        """
        dataset = self.dummy_dataset

        if issubclass(self.dataset_cls, BaseDaskDataset):  # dask
            unmodified_copy = dataset.dataframe.copy()
        else:
            unmodified_copy = dataset.dataframe.copy(deep=True)

        for column, split in itertools.product(
                ["X"], ["TRAIN", "TEST", "VALIDATION", None]):
            with self.subTest(column=column, split=split):
                # Named `retrieved` rather than `copy` so the local does not
                # shadow the stdlib `copy` module name.
                retrieved = dataset.get(column=column, split=split)

                try:
                    # Test for pandas references
                    self.assertIsNone(retrieved._is_copy)

                    # Not fully understood behavior causes pandas to return views for certain
                    # operations that morph into copies when modified (appears subject to mem optimizations)
                    self.assertFalse(retrieved._is_view)

                    # Modify the retrieved frame and see if the source changed
                    # dask doesnt support item assignment
                    retrieved.loc[1, "a"] = 9876

                    # A fresh `get` must differ from the modified frame.
                    with self.assertRaises(AssertionError):
                        assert_data_container_equal(
                            retrieved,
                            dataset.get(column=column, split=split))

                except AttributeError:
                    pass

                assert_data_container_equal(dataset.dataframe, unmodified_copy)

                # Identity checks. `assertIsNot` holds both operands alive
                # during the comparison; comparing `id()` values of
                # temporaries is unreliable because an id may be reused once
                # the first object has been freed.
                self.assertIsNot(dataset._external_file, retrieved)
                self.assertIsNot(dataset._external_file, dataset.dataframe)
                self.assertIsNot(dataset._external_file, unmodified_copy)
                self.assertIsNot(dataset.dataframe, retrieved)
                self.assertIsNot(unmodified_copy, retrieved)
示例#8
0
 def test_dataframe_mutability(self):
     """
     Mutating the frame obtained from `_dataframe` must not affect the
     raw data: after dropping the split column from it, a fresh
     `_dataframe` is expected to compare unequal to the mutated frame.
     """
     dataset = self.dummy_dataset
     mutated = dataset._dataframe

     try:
         # Preferred path: drop the column in place.
         mutated.drop(DATAFRAME_SPLIT_COLUMN, axis=1, inplace=True)
     except TypeError:  # dask
         # Fall back to the non-inplace drop and rebind the result.
         mutated = mutated.drop(DATAFRAME_SPLIT_COLUMN, axis=1)

     with self.assertRaises(AssertionError):
         assert_data_container_equal(dataset._dataframe, mutated)
示例#9
0
    def test_get_with_split(self):
        """
        Requesting the "TRAIN" split should return the corresponding
        slices for X, y and the whole frame.
        """
        dataset = self.dummy_dataset

        # Fetch all three views of the TRAIN split before asserting.
        train_x = dataset.get(column="X", split="TRAIN")
        train_y = dataset.get(column="y", split="TRAIN")
        train_frame = dataset.get(column=None, split="TRAIN")

        assert_data_container_equal(self.expected_train_x, train_x)
        assert_data_container_equal(self.expected_train_y, train_y)
        assert_data_container_equal(self.expected_train_dataframe, train_frame)
示例#10
0
 def compare_datasets(self, new, old):
     """
     Compare two datasets: first through `compare_hashes`, then by
     checking that their `_external_file` containers are equal.
     """
     self.compare_hashes(new, old)

     new_file = new._external_file
     old_file = old._external_file
     assert_data_container_equal(new_file, old_file)
示例#11
0
    def test_get_nonexistent_split(self):
        """
        Requesting a split that does not exist should return an empty
        frame (compared against `.head(0)` of the expected containers).
        """
        dataset = self.dummy_dataset
        X = dataset.get(column="X", split="NONSENSE")
        y = dataset.get(column="y", split="NONSENSE")
        data = dataset.get(column=None, split="NONSENSE")

        # The type of the full frame decides how all three results are
        # handled.
        is_dask = isinstance(data, ddDataFrame)

        def materialize(container):
            # Original note: head calls return a pd.DataFrame/series, so
            # dask results are computed before being compared to them.
            return container.compute() if is_dask else container

        assert_data_container_equal(materialize(X), self.expected_x.head(0))
        assert_data_container_equal(materialize(y), self.expected_y.head(0))
        assert_data_container_equal(materialize(data),
                                    self.expected_dataframe.head(0))