def test_singletask_index_k_fold_split(self):
    """
    Test singletask IndexSplitter class.

    Splits the dataset returned by ``self.load_solubility_data()`` into
    K = 5 folds with ``IndexSplitter.k_fold_split`` and checks that:

    * every fold holds exactly 2 entries,
    * every fold's ids are a subset of the original ids,
    * no two folds share an id,
    * merging all folds with ``DiskDataset.merge`` gives back a dataset
      with the same length and the same (sorted) ids as the original.

    All temporary directories created by the test are removed on exit,
    whether the assertions pass or fail.
    """
    # Function-scope import: only the cleanup below needs shutil.
    import shutil

    solubility_dataset = self.load_solubility_data()
    index_splitter = IndexSplitter()
    ids_set = set(solubility_dataset.ids)

    K = 5
    # Every directory obtained from mkdtemp() is recorded here so that the
    # finally-clause can delete it; mkdtemp() never cleans up on its own.
    scratch_dirs = []
    try:
        fold_dirs = [tempfile.mkdtemp() for _ in range(K)]
        scratch_dirs.extend(fold_dirs)
        fold_datasets = index_splitter.k_fold_split(solubility_dataset,
                                                    fold_dirs)

        # Build each fold's id set once instead of once per comparison.
        fold_id_sets = [set(fold_datasets[fold].ids) for fold in range(K)]

        for fold in range(K):
            # Verify lengths is 10/k == 2
            assert len(fold_datasets[fold]) == 2
            # Verify that compounds in this fold are subset of original
            # compounds
            assert fold_id_sets[fold].issubset(ids_set)

        # Verify that no two folds have overlapping compounds.
        # isdisjoint() is symmetric, so each unordered pair is checked once.
        for fold in range(K):
            for other_fold in range(fold + 1, K):
                assert fold_id_sets[fold].isdisjoint(fold_id_sets[other_fold])

        merge_dir = tempfile.mkdtemp()
        scratch_dirs.append(merge_dir)
        merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
        assert len(merged_dataset) == len(solubility_dataset)
        assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
    finally:
        # Best-effort cleanup: a failure to delete must not mask a test
        # failure raised above.
        for scratch_dir in scratch_dirs:
            shutil.rmtree(scratch_dir, ignore_errors=True)
def test_k_fold_splitter(self):
    """
    Test that we can 5 fold index wise over 5 points

    A 5-point NumpyDataset whose X values are 0..4 is split into K = 5
    folds.  Each element of the result is indexed as a pair: item 0 is
    used as the training part, item 1 as the held-out part.  For fold
    ``i`` the test checks that the first held-out X value is ``i`` and
    that the training part has K - 1 distinct X values, none equal to
    ``i``.
    """
    # NOTE(review): a later definition named test_k_fold_splitter follows
    # this one in the file; if both live in the same class, the later one
    # replaces this method and this test never runs -- confirm and rename
    # or remove one of them.
    ds = NumpyDataset(np.array(range(5)), np.array(range(5)))
    index_splitter = IndexSplitter()

    K = 5
    fold_datasets = index_splitter.k_fold_split(ds, K)

    for fold in range(K):
        train, cv = fold_datasets[fold][0], fold_datasets[fold][1]
        # assertEqual / assertNotIn report the offending values on
        # failure, unlike assertTrue(a == b) / assertFalse(a in b).
        self.assertEqual(cv.X[0], fold)
        train_data = set(train.X)
        self.assertNotIn(fold, train_data)
        self.assertEqual(K - 1, len(train_data))
def test_k_fold_splitter(self):
    """
    Test that we can 5 fold index wise over 5 points

    A 5-point NumpyDataset whose X values are 0..4 is split into K = 5
    folds.  Each element of the result is indexed as a pair: item 0 is
    used as the training part, item 1 as the held-out part.  For fold
    ``i`` the test checks that:

    * the first held-out X value is ``i``,
    * ``i`` does not appear among the training X values,
    * the training part has length K - 1 and the held-out part length 1.
    """
    ds = NumpyDataset(np.array(range(5)), np.array(range(5)))
    index_splitter = IndexSplitter()

    K = 5
    fold_datasets = index_splitter.k_fold_split(ds, K)

    for fold in range(K):
        train, cv = fold_datasets[fold][0], fold_datasets[fold][1]
        # assertEqual / assertNotIn report the offending values on
        # failure, unlike assertTrue(a == b) / assertFalse(a in b).
        self.assertEqual(cv.X[0], fold)
        train_data = set(train.X)
        self.assertNotIn(fold, train_data)
        self.assertEqual(K - 1, len(train))
        self.assertEqual(1, len(cv))