예제 #1
0
    def test_singletask_index_k_fold_split(self):
        """Check IndexSplitter.k_fold_split on the single-task solubility data.

        Splits the dataset returned by ``self.load_solubility_data()`` into
        K = 5 folds (one temporary directory per fold) and verifies that:

        * every fold contains exactly two entries,
        * each fold's ids are a subset of the original ids,
        * no two folds share an id,
        * merging the folds gives back the original ids.

        Every temporary directory created here is removed before the test
        returns, whether or not an assertion fails.
        """
        import shutil  # local import: only needed for the cleanup below

        solubility_dataset = self.load_solubility_data()
        index_splitter = IndexSplitter()
        ids_set = set(solubility_dataset.ids)

        K = 5
        # tempfile.mkdtemp() leaves deletion to the caller, so remember every
        # directory we create and remove them all in the ``finally`` clause.
        temp_dirs = []
        try:
            for _ in range(K):
                temp_dirs.append(tempfile.mkdtemp())
            fold_dirs = list(temp_dirs)
            fold_datasets = index_splitter.k_fold_split(solubility_dataset,
                                                        fold_dirs)

            seen_fold_ids = []
            for fold in range(K):
                fold_dataset = fold_datasets[fold]
                # Presumably the fixture holds 10 compounds, so 10 / K == 2
                # (taken from the original comment; confirm against the data).
                assert len(fold_dataset) == 2
                # Compounds in this fold must come from the original dataset.
                fold_ids_set = set(fold_dataset.ids)
                assert fold_ids_set.issubset(ids_set)
                # Disjointness is symmetric, so comparing against the folds
                # already visited covers every pair exactly once.
                for earlier_ids in seen_fold_ids:
                    assert fold_ids_set.isdisjoint(earlier_ids)
                seen_fold_ids.append(fold_ids_set)

            merge_dir = tempfile.mkdtemp()
            temp_dirs.append(merge_dir)
            merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
            assert len(merged_dataset) == len(solubility_dataset)
            assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
        finally:
            for path in temp_dirs:
                shutil.rmtree(path, ignore_errors=True)
예제 #2
0
  def test_k_fold_splitter(self):
    """Check that IndexSplitter can split 5 points into 5 index-wise folds.

    The dataset is built from two copies of the integers 0..4.  Each entry
    of the ``k_fold_split`` result is indexed as ``[0]`` (training part) and
    ``[1]`` (held-out part).  For fold ``i`` the first held-out sample must
    be ``i``, and the training part must hold the other K - 1 distinct
    values.
    """
    ds = NumpyDataset(np.arange(5), np.arange(5))
    index_splitter = IndexSplitter()

    K = 5
    fold_datasets = index_splitter.k_fold_split(ds, K)

    for fold in range(K):
      fold_pair = fold_datasets[fold]
      # Specific assert methods report both operands when they fail,
      # unlike assertTrue(a == b) / assertFalse(a in b).
      self.assertEqual(fold_pair[1].X[0], fold)
      train_data = set(fold_pair[0].X)
      self.assertNotIn(fold, train_data)
      # train_data is a set, so this counts distinct training values.
      self.assertEqual(K - 1, len(train_data))
예제 #3
0
  def test_k_fold_splitter(self):
    """Check that IndexSplitter can split 5 points into 5 index-wise folds.

    The dataset is built from two copies of the integers 0..4.  Each entry
    of the ``k_fold_split`` result is indexed as ``[0]`` (training part) and
    ``[1]`` (cross-validation part).  For fold ``i`` the test expects:

    * the first cross-validation sample to be ``i``,
    * ``i`` to be absent from the training samples,
    * the training part to have K - 1 entries and the cross-validation
      part exactly one.
    """
    ds = NumpyDataset(np.arange(5), np.arange(5))
    index_splitter = IndexSplitter()

    K = 5
    fold_datasets = index_splitter.k_fold_split(ds, K)

    for fold in range(K):
      fold_pair = fold_datasets[fold]
      train, cv = fold_pair[0], fold_pair[1]
      # Specific assert methods report both operands when they fail,
      # unlike assertTrue(a == b) / assertFalse(a in b).
      self.assertEqual(cv.X[0], fold)
      self.assertNotIn(fold, set(train.X))
      self.assertEqual(K - 1, len(train))
      self.assertEqual(1, len(cv))