Example #1
    def _run_outer_loop(
        self,
        input_data: InputDataset,
        outer_split: Split,
        data_splitter: DataSplitter,
    ) -> OuterLoopResults:

        # Start from the full feature set and iteratively eliminate features
        # until the minimum number of features is reached.
        feature_elimination_results = {}
        feature_set = list(range(input_data.n_features))

        while len(feature_set) >= self._minimum_features:
            inner_results = []

            for inner_split in data_splitter.iter_inner_splits(outer_split):
                inner_loop_data = data_splitter.split_data(
                    input_data, inner_split, feature_set
                )

                feature_evaluation_results = self._feature_evaluator.evaluate_features(
                    inner_loop_data, feature_set
                )

                inner_results.append(feature_evaluation_results)

            # Record this round's inner-loop results, then drop the
            # worst-ranked features before the next elimination round.
            feature_elimination_results[tuple(feature_set)] = inner_results
            feature_set = self._remove_features(feature_set, inner_results)

        outer_loop_results = self._create_outer_loop_results(
            feature_elimination_results, input_data, outer_split, data_splitter
        )

        return outer_loop_results
Example #2
def test_split_data(dataset):
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    # Split(id, train_indices, test_indices)
    split = Split(0, [0, 1, 2], [3, 4, 5, 6])
    split_data = ds.split_data(dataset, split, features=[0, 1, 2])
    assert split_data.test_data.X.shape == (4, 3)
    assert split_data.test_data.y.size == 4
    assert split_data.train_data.X.shape == (3, 3)
    assert split_data.train_data.y.size == 3
Example #3
def test_non_randomness(dataset):
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    ds2 = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    # The same integer seed must reproduce identical splits.
    assert all(
        sorted(ds._splits[k].train_indices) == sorted(ds2._splits[k].train_indices)
        for k in ds._splits
    )
Example #4
import numpy as np


def test_randomness(dataset):
    # Passing the same RandomState instance (rather than an integer seed)
    # advances its stream between constructions, so the two splitters
    # produce different splits.
    random_state = np.random.RandomState(0)
    ds = DataSplitter(
        n_outer=5, n_inner=4, random_state=random_state, input_data=dataset
    )
    ds2 = DataSplitter(
        n_outer=5, n_inner=4, random_state=random_state, input_data=dataset
    )
    assert all(
        sorted(ds._splits[k].train_indices) != sorted(ds2._splits[k].train_indices)
        for k in ds._splits
    )
Example #5
def test_split_separation(dataset):
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    # Every index should appear exactly once when joining outer test,
    # inner test, and inner train.
    for outer_split in ds.iter_outer_splits():
        for inner_split in ds.iter_inner_splits(outer_split):
            all_indices = (list(outer_split.test_indices) +
                           list(inner_split.test_indices) +
                           list(inner_split.train_indices))
            assert len(all_indices) == 12
            assert sorted(all_indices) == list(range(12))
            # Inner train and inner test together must equal the outer train set.
            out_train = list(inner_split.test_indices) + list(inner_split.train_indices)
            assert sorted(out_train) == sorted(outer_split.train_indices)
Example #6
def test_make_splits_grouped(grouped_dataset):
    ds = DataSplitter(n_outer=5,
                      n_inner=4,
                      random_state=0,
                      input_data=grouped_dataset)
    groups = grouped_dataset.groups
    assert ds
    # check there is no intersection among the groups
    for outer_split in ds.iter_outer_splits():
        train_idx = outer_split.train_indices
        test_idx = outer_split.test_indices
        for inner_split in ds.iter_inner_splits(outer_split):
            inner_train = inner_split.train_indices
            valid_idx = inner_split.test_indices
            assert not set(groups[inner_train]).intersection(groups[valid_idx])
            assert not set(groups[test_idx]).intersection(groups[valid_idx])
            assert not set(groups[inner_train]).intersection(groups[test_idx])
Example #7

import pytest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC


def test_evaluate_features(dataset):
    pipeline = Pipeline(
        [("normalizer", Normalizer()), ("model", SVC(kernel="linear", random_state=0))]
    )
    fe = FeatureEvaluator(estimator=pipeline, metric="MISS", random_state=0)
    fe.set_n_initial_features(12)
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    split = next(ds.iter_outer_splits())
    evaluation_data = ds.split_data(dataset, split)
    evaluation = fe.evaluate_features(evaluation_data, [0, 4, 6])
    assert evaluation
    assert evaluation.test_score >= 0
    assert evaluation.ranks
    assert isinstance(evaluation.ranks, FeatureRanks)
    assert evaluation.ranks.n_feats == 12
    assert evaluation.ranks[0]
    assert evaluation.ranks[1]
    with pytest.raises(ValueError):
        _ = evaluation.ranks[100]
Example #8
    def _evaluate_min_mid_and_max_features(
        self,
        input_data: InputDataset,
        best_features: SelectedFeatures,
        split: Split,
        data_splitter: DataSplitter,
    ) -> Tuple[
        FeatureEvaluationResults, FeatureEvaluationResults, FeatureEvaluationResults
    ]:
        min_feats = best_features["min"]
        mid_feats = best_features["mid"]
        max_feats = best_features["max"]

        data_min_feats = data_splitter.split_data(input_data, split, min_feats)
        data_mid_feats = data_splitter.split_data(input_data, split, mid_feats)
        data_max_feats = data_splitter.split_data(input_data, split, max_feats)

        min_eval = self._feature_evaluator.evaluate_features(data_min_feats, min_feats)
        mid_eval = self._feature_evaluator.evaluate_features(data_mid_feats, mid_feats)
        max_eval = self._feature_evaluator.evaluate_features(data_max_feats, max_feats)

        return min_eval, mid_eval, max_eval
Example #9
def test_make_splits(dataset):
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    assert ds._splits
    assert len(ds._splits) == 5 * 4 + 5
    split = ds._splits[(0, 2)]
    train_idx = split.train_indices
    test_idx = split.test_indices
    assert len(train_idx) == 6
    assert len(test_idx) == 3
    split = ds._splits[(0, None)]
    train_idx = split.train_indices
    test_idx = split.test_indices
    assert len(train_idx) == 9
    assert len(test_idx) == 3
    assert not set(train_idx).intersection(test_idx)
Example #10
def test_iter_inner_splits(dataset):
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    for outer_split in ds.iter_outer_splits():
        for inner_split in ds.iter_inner_splits(outer_split):
            assert inner_split
            assert inner_split == ds._splits[(outer_split.id, inner_split.id)]
Example #11
def test_data_splitter(dataset):
    ds = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    assert ds
    assert ds._splits
Example #12
    def fit(
        self,
        X: NumpyArray,
        y: NumpyArray,
        groups: NumpyArray = None,
        executor: Executor = None,
    ) -> FeatureSelector:
        """
        Implements the double CV feature selection algorithm. The method returns
        the same FeatureSelector. If the samples are correlated, the `group` vector
        can be used to encode arbitrary domain specific stratifications of the
        samples as integers (e.g. patient_id, year of collection, etc.). If group
        is not provided the samples are assumed to be i. i. d. variables.
        To parallelize the CV repetition, an `executor` can be provided to split
        the computation across processes or cluster nodes. So far, `loky` (joblib),
        `dask`, and `concurrent` Executors are tested.

        Parameters
        ----------
        X : NumpyArray
            Predictor variables as numpy array
        y : NumpyArray
            Response vector (Dependent variable).
        groups : NumpyArray, optional
            Group labels for the samples used while splitting the dataset
            into train/test set, by default None
        executor : Executor, optional
            executor instance for parallel computing, by default None

        Returns
        -------
        FeatureSelector
            the fit feature selector
        """

        # Fall back to synchronous, in-process execution when no executor is given.
        if executor is None:
            executor = SyncExecutor()

        size, n_features = X.shape
        groups = self._get_groups(groups, size)
        input_data = InputDataset(X=X, y=y, groups=groups)
        self._feature_evaluator.set_n_initial_features(n_features)

        log.info(
            f"Running {self.n_repetitions} repetitions and"
            f" {self.n_outer} outer loops using "
            f"executor {executor.__class__.__name__}."
        )

        repetition_results = []

        log.info("Scheduling tasks...")
        Progressbar = self._make_progress_bar()
        with Progressbar(max_value=self.n_repetitions * self.n_outer) as b:
            progress = 0
            b.update(progress)
            for _ in range(self.n_repetitions):
                data_splitter = DataSplitter(
                    self.n_outer,
                    self.n_inner,
                    input_data,
                    self.random_state,
                )

                outer_loop_results = []
                for outer_split in data_splitter.iter_outer_splits():
                    outer_loop_result = self._deferred_run_outer_loop(
                        input_data,
                        outer_split,
                        executor=executor,
                        data_splitter=data_splitter,
                    )
                    outer_loop_results.append(outer_loop_result)
                    progress += 1
                    b.update(progress)

                repetition_results.append(outer_loop_results)

        self._selected_features = self._select_best_features(repetition_results)
        log.info("Finished feature selection.")
        self._n_features = input_data.n_features
        self.is_fit = True
        return self
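
For context, here is a minimal usage sketch of `fit` with a `concurrent.futures` executor, as the docstring suggests. The `FeatureSelector` constructor arguments (`n_repetitions`, `n_outer`, `n_inner`, `estimator`, `metric`) are assumptions inferred from the snippets in this listing and may not match the library's actual signature; `FeatureSelector` is assumed to be importable from the package under test.

# Hypothetical usage sketch: the FeatureSelector constructor arguments are
# inferred from the snippets above and may differ from the real signature.
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC

X = np.random.rand(12, 12)               # 12 samples, 12 features
y = np.random.choice([0, 1], size=12)    # binary response vector
groups = np.arange(12) // 3              # e.g. three samples per patient

pipeline = Pipeline(
    [("normalizer", Normalizer()), ("model", SVC(kernel="linear", random_state=0))]
)
selector = FeatureSelector(
    n_repetitions=8,
    n_outer=5,
    n_inner=4,
    estimator=pipeline,
    metric="MISS",
    random_state=0,
)

# Parallelize the CV repetitions across processes, as described in the docstring.
with ProcessPoolExecutor() as executor:
    selector.fit(X, y, groups=groups, executor=executor)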