def _run_outer_loop(
    self,
    input_data: InputDataset,
    outer_split: Split,
    data_splitter: DataSplitter,
) -> OuterLoopResults:
    """Run recursive feature elimination for a single outer CV split.

    Starting from the full feature set, the current candidate features are
    evaluated on every inner split of ``outer_split``; the per-split results
    are recorded and the feature set is shrunk via ``self._remove_features``.
    Elimination stops once fewer than ``self._minimum_features`` would remain.

    Parameters
    ----------
    input_data : InputDataset
        Full dataset the splits index into.
    outer_split : Split
        Outer split whose inner splits drive the evaluation.
    data_splitter : DataSplitter
        Provider of inner splits and train/test partitions.

    Returns
    -------
    OuterLoopResults
        Aggregated results built from the elimination history.
    """
    elimination_history = {}
    current_features = list(range(input_data.n_features))

    while len(current_features) >= self._minimum_features:
        # Evaluate the current candidate set on each inner split.
        evaluations = [
            self._feature_evaluator.evaluate_features(
                data_splitter.split_data(input_data, inner_split, current_features),
                current_features,
            )
            for inner_split in data_splitter.iter_inner_splits(outer_split)
        ]
        # Key the history by the (immutable) feature tuple before shrinking.
        elimination_history[tuple(current_features)] = evaluations
        current_features = self._remove_features(current_features, evaluations)

    return self._create_outer_loop_results(
        elimination_history, input_data, outer_split, data_splitter
    )
def test_split_data(dataset):
    """Train/test partitions have the expected sample and feature counts."""
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    split = Split(0, [0, 1, 2], [3, 4, 5, 6])
    result = splitter.split_data(dataset, split, features=[0, 1, 2])
    # 4 test samples x 3 selected features
    assert result.test_data.X.shape == (4, 3)
    assert result.test_data.y.size == 4
    # 3 train samples x 3 selected features
    assert result.train_data.X.shape == (3, 3)
    assert result.train_data.y.size == 3
def test_non_randomness(dataset):
    """Two splitters built with the same integer seed produce identical splits."""
    first = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    second = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    for key in first._splits:
        assert sorted(first._splits[key].train_indices) == sorted(
            second._splits[key].train_indices
        )
def test_randomness(dataset):
    """A shared RandomState advances between splitters, so their splits differ."""
    rng = np.random.RandomState(0)
    first = DataSplitter(n_outer=5, n_inner=4, random_state=rng, input_data=dataset)
    second = DataSplitter(n_outer=5, n_inner=4, random_state=rng, input_data=dataset)
    for key in first._splits:
        assert sorted(first._splits[key].train_indices) != sorted(
            second._splits[key].train_indices
        )
def test_split_separation(dataset):
    """Outer-test, inner-test and inner-train indices partition all 12 samples."""
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    for outer_split in splitter.iter_outer_splits():
        for inner_split in splitter.iter_inner_splits(outer_split):
            combined = (
                list(outer_split.test_indices)
                + list(inner_split.test_indices)
                + list(inner_split.train_indices)
            )
            # every sample index appears exactly once across the three groups
            assert len(combined) == 12
            assert sorted(combined) == list(range(12))
            # inner train + inner test together make up the outer train set
            outer_train = list(inner_split.test_indices) + list(
                inner_split.train_indices
            )
            assert sorted(outer_train) == sorted(outer_split.train_indices)
def test_make_splits_grouped(grouped_dataset):
    """Group labels never leak between train, validation and test partitions."""
    splitter = DataSplitter(
        n_outer=5, n_inner=4, random_state=0, input_data=grouped_dataset
    )
    groups = grouped_dataset.groups
    assert splitter
    for outer_split in splitter.iter_outer_splits():
        outer_test = outer_split.test_indices
        for inner_split in splitter.iter_inner_splits(outer_split):
            inner_train = inner_split.train_indices
            inner_valid = inner_split.test_indices
            # no group may straddle any pair of partitions
            assert not set(groups[inner_train]) & set(groups[inner_valid])
            assert not set(groups[outer_test]) & set(groups[inner_valid])
            assert not set(groups[inner_train]) & set(groups[outer_test])
def test_evaluate_features(dataset):
    """FeatureEvaluator yields a populated evaluation with well-formed ranks."""
    pipeline = Pipeline(
        [("normalizer", Normalizer()), ("model", SVC(kernel="linear", random_state=0))]
    )
    evaluator = FeatureEvaluator(estimator=pipeline, metric="MISS", random_state=0)
    evaluator.set_n_initial_features(12)
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    outer_split = next(splitter.iter_outer_splits())
    evaluation_data = splitter.split_data(dataset, outer_split)

    evaluation = evaluator.evaluate_features(evaluation_data, [0, 4, 6])

    assert evaluation
    assert evaluation.test_score >= 0
    assert evaluation.ranks
    assert isinstance(evaluation.ranks, FeatureRanks)
    assert evaluation.ranks.n_feats == 12
    # ranks are defined for in-range feature indices...
    assert evaluation.ranks[0]
    assert evaluation.ranks[1]
    # ...and out-of-range lookups raise
    with pytest.raises(ValueError):
        _ = evaluation.ranks[100]
def _evaluate_min_mid_and_max_features(
    self,
    input_data: InputDataset,
    best_features: SelectedFeatures,
    split: Split,
    data_splitter: DataSplitter,
) -> Tuple[
    FeatureEvaluationResults, FeatureEvaluationResults, FeatureEvaluationResults
]:
    """Evaluate the "min", "mid" and "max" selected feature sets on one split.

    For each of the three feature sets stored in ``best_features`` the data is
    partitioned according to ``split`` and evaluated with the configured
    feature evaluator.

    Returns
    -------
    Tuple[FeatureEvaluationResults, FeatureEvaluationResults, FeatureEvaluationResults]
        Evaluation results in (min, mid, max) order.
    """
    evaluations = []
    for key in ("min", "mid", "max"):
        features = best_features[key]
        split_data = data_splitter.split_data(input_data, split, features)
        evaluations.append(
            self._feature_evaluator.evaluate_features(split_data, features)
        )
    min_eval, mid_eval, max_eval = evaluations
    return min_eval, mid_eval, max_eval
def test_make_splits(dataset):
    """The splitter stores 5 outer splits plus 4 inner splits per outer."""
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    assert splitter._splits
    # 5 outer splits, each with 4 inner splits
    assert len(splitter._splits) == 5 * 4 + 5

    # inner splits are keyed by (outer_id, inner_id)
    inner = splitter._splits[(0, 2)]
    assert len(inner.train_indices) == 6
    assert len(inner.test_indices) == 3

    # outer splits use inner_id None
    outer = splitter._splits[(0, None)]
    assert len(outer.train_indices) == 9
    assert len(outer.test_indices) == 3
    assert not set(outer.train_indices).intersection(outer.test_indices)
def test_iter_inner_splits(dataset):
    """Iterated inner splits match the entries stored in ``_splits``."""
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    for outer in splitter.iter_outer_splits():
        for inner in splitter.iter_inner_splits(outer):
            assert inner
            assert inner == splitter._splits[(outer.id, inner.id)]
def test_data_splitter(dataset):
    """Constructing a DataSplitter populates its internal splits."""
    splitter = DataSplitter(n_outer=5, n_inner=4, random_state=0, input_data=dataset)
    assert splitter
    assert splitter._splits
def fit(
    self,
    X: NumpyArray,
    y: NumpyArray,
    groups: NumpyArray = None,
    executor: Executor = None,
) -> FeatureSelector:
    """
    Implements the double CV feature selection algorithm. The method
    returns the same FeatureSelector.

    If the samples are correlated, the `group` vector can be used to encode
    arbitrary domain specific stratifications of the samples as integers
    (e.g. patient_id, year of collection, etc.). If group is not provided
    the samples are assumed to be i.i.d. variables.
    To parallelize the CV repetitions, an `executor` can be provided to split
    the computation across processes or cluster nodes. So far, `loky`
    (joblib), `dask`, and `concurrent` Executors are tested.

    Parameters
    ----------
    X : NumpyArray
        Predictor variables as numpy array
    y : NumpyArray
        Response vector (Dependent variable).
    groups : NumpyArray, optional
        Group labels for the samples used while splitting the dataset into
        train/test set, by default None
    executor : Executor, optional
        executor instance for parallel computing, by default None

    Returns
    -------
    FeatureSelector
        the fit feature selector
    """
    # Fall back to synchronous (in-process) execution when no executor given.
    if executor is None:
        executor = SyncExecutor()

    n_samples, n_features = X.shape
    groups = self._get_groups(groups, n_samples)
    input_data = InputDataset(X=X, y=y, groups=groups)
    self._feature_evaluator.set_n_initial_features(n_features)

    log.info(
        f"Running {self.n_repetitions} repetitions and"
        f" {self.n_outer} outer loops using "
        f"executor {executor.__class__.__name__}."
    )

    repetition_results = []
    log.info("Scheduling tasks...")
    progress_bar_cls = self._make_progress_bar()
    with progress_bar_cls(max_value=self.n_repetitions * self.n_outer) as bar:
        done = 0
        bar.update(done)
        for _ in range(self.n_repetitions):
            # Fresh splits for every repetition; the shared random state
            # makes each repetition's CV partition different.
            data_splitter = DataSplitter(
                self.n_outer,
                self.n_inner,
                input_data,
                self.random_state,
            )
            outer_results = []
            for outer_split in data_splitter.iter_outer_splits():
                # Scheduling is deferred through the executor; results are
                # collected after all tasks have been submitted.
                outer_results.append(
                    self._deferred_run_outer_loop(
                        input_data,
                        outer_split,
                        executor=executor,
                        data_splitter=data_splitter,
                    )
                )
                done += 1
                bar.update(done)
            repetition_results.append(outer_results)

    self._selected_features = self._select_best_features(repetition_results)
    log.info("Finished feature selection.")
    self._n_features = input_data.n_features
    self.is_fit = True
    return self