# Example 1
    def _run_outer_loop(
        self,
        input_data: InputDataset,
        outer_split: Split,
        data_splitter: DataSplitter,
    ) -> OuterLoopResults:
        """Run recursive feature elimination for a single outer CV split.

        Starting from all features, repeatedly evaluate the current feature
        set on every inner split, record the results, and shrink the set,
        until it drops below the configured minimum size.
        """
        elimination_history = {}
        remaining = list(range(input_data.n_features))

        while len(remaining) >= self._minimum_features:
            # Evaluate the current feature set on each inner split.
            evaluations = [
                self._feature_evaluator.evaluate_features(
                    data_splitter.split_data(input_data, inner, remaining),
                    remaining,
                )
                for inner in data_splitter.iter_inner_splits(outer_split)
            ]

            # Keyed by the (immutable) feature tuple evaluated at this step.
            elimination_history[tuple(remaining)] = evaluations
            remaining = self._remove_features(remaining, evaluations)

        return self._create_outer_loop_results(
            elimination_history, input_data, outer_split, data_splitter
        )
# Example 2
def test_split_data(dataset):
    """DataSplitter.split_data restricts rows to the split and columns to the features."""
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    outer = Split(0, [0, 1, 2], [3, 4, 5, 6])
    result = splitter.split_data(dataset, outer, features=[0, 1, 2])

    # Test side: 4 samples, 3 selected feature columns.
    assert result.test_data.X.shape == (4, 3)
    assert result.test_data.y.size == 4
    # Train side: 3 samples, same 3 columns.
    assert result.train_data.X.shape == (3, 3)
    assert result.train_data.y.size == 3
# Example 3
    def _evaluate_min_mid_and_max_features(
        self,
        input_data: InputDataset,
        best_features: SelectedFeatures,
        split: Split,
        data_splitter: DataSplitter,
    ) -> Tuple[
        FeatureEvaluationResults, FeatureEvaluationResults, FeatureEvaluationResults
    ]:
        """Evaluate the "min", "mid" and "max" best feature sets on one split.

        Returns the three evaluation results in (min, mid, max) order.
        """
        evaluations = []
        # Same split each time; only the feature subset changes.
        for size_key in ("min", "mid", "max"):
            subset = best_features[size_key]
            subset_data = data_splitter.split_data(input_data, split, subset)
            evaluations.append(
                self._feature_evaluator.evaluate_features(subset_data, subset)
            )

        min_eval, mid_eval, max_eval = evaluations
        return min_eval, mid_eval, max_eval
def test_evaluate_features(dataset):
    """FeatureEvaluator scores a feature subset and exposes valid ranks."""
    estimator = Pipeline(
        [("normalizer", Normalizer()), ("model", SVC(kernel="linear", random_state=0))]
    )
    evaluator = FeatureEvaluator(estimator=estimator, metric="MISS", random_state=0)
    evaluator.set_n_initial_features(12)

    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    first_split = next(splitter.iter_outer_splits())
    evaluation_data = splitter.split_data(dataset, first_split)

    result = evaluator.evaluate_features(evaluation_data, [0, 4, 6])

    assert result
    assert result.test_score >= 0
    assert result.ranks
    assert isinstance(result.ranks, FeatureRanks)
    assert result.ranks.n_feats == 12
    assert result.ranks[0]
    assert result.ranks[1]
    # Indexing past the declared initial feature count must raise.
    with pytest.raises(ValueError):
        _ = result.ranks[100]