/
feature_selection.py
747 lines (598 loc) · 26.1 KB
/
feature_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
from sklearn.base import BaseEstimator
from sklearn.feature_selection.base import SelectorMixin
from sklearn.utils import safe_mask, check_array
from sklearn.utils.sparsefuncs import mean_variance_axis
from warnings import warn
import numpy as np
from decog.estimators import RoiEnsemble
class MultiRoiVarianceThreshold(BaseEstimator, SelectorMixin):
"""Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the
desired outputs (y), and can thus be used for unsupervised learning.
Read more in the :ref:`User Guide <variance_threshold>`.
Parameters
----------
threshold : float, optional
Features with a training-set variance lower than this threshold will
be removed. The default is to keep all features with non-zero variance,
i.e. remove the features that have the same value in all samples.
Attributes
----------
variances_ : array, shape (n_features,)
Variances of individual features.
"""
def __init__(self, threshold=0., allow_empty_roi=True):
self.threshold = threshold
self.allow_empty_roi = allow_empty_roi
def fit(self, X, y=None):
"""Learn empirical variances from X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Sample vectors from which to compute variances.
y : any
Ignored. This parameter exists only for compatibility with
sklearn.pipeline.Pipeline.
Returns
-------
self
"""
if not isinstance(X, dict):
raise ValueError('X must be a dict')
self.variances_ = {k: np.var(x, axis=0) for k, x in X.items()}
roi_no_sample = [k for k, v in self.variances_.items() if np.all(v <= self.threshold)]
self.roi_id_valid = np.setdiff1d(list(self.variances_.keys()), roi_no_sample, assume_unique=True)
if len(roi_no_sample) == len(self.variances_):
raise ValueError("No feature in any roi meets the variance threshold %.5f" % self.threshold)
elif np.any(roi_no_sample):
message = "No feature in roi(s) %s meets the variance threshold %.5f" % (list(roi_no_sample), self.threshold)
if self.allow_empty_roi:
warn(message, UserWarning)
else:
raise ValueError(message)
# for i, var in enumerate(self.variances_):
# if np.all(var <= self.threshold):
# msg = "No feature in roi %g meets the variance threshold {0:.5f}" % i
# if X[i].shape[0] == 1:
# msg += " (roi contains only one sample)"
# raise ValueError(msg.format(self.threshold))
return self
def transform(self, X):
"""Reduce X to the selected features.
Parameters
----------
X : array of shape [n_samples, n_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
masks = self.get_support()
X_r = dict()
for roi_id in self.roi_id_valid:
mask = masks[roi_id]
if len(mask) != X[roi_id].shape[1]:
raise ValueError("Roi %g has a different shape than during fitting." % roi_id)
if not mask.any():
warn("No features were selected in roi %g: either the data is"
" too noisy or the selection test too strict." % roi_id,
UserWarning)
X_r[roi_id] = np.empty(0).reshape((X[roi_id].shape[0], 0))
else:
X_r[roi_id] = X[roi_id][:, safe_mask(X[roi_id], mask)]
return X_r
def _get_support_mask(self):
support_masks = {k: v > self.threshold for k, v in self.variances_.items()}
return support_masks
class MultiRoiVariancePercentile(BaseEstimator, SelectorMixin):
"""Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the
desired outputs (y), and can thus be used for unsupervised learning.
Read more in the :ref:`User Guide <variance_threshold>`.
Parameters
----------
threshold : float, optional
Features with a training-set variance lower than this threshold will
be removed. The default is to keep all features with non-zero variance,
i.e. remove the features that have the same value in all samples.
Attributes
----------
variances_ : array, shape (n_features,)
Variances of individual features.
"""
def __init__(self, threshold=0., allow_empty_roi=True):
self.threshold = threshold
self.allow_empty_roi = allow_empty_roi
def fit(self, X, y=None):
"""Learn empirical variances from X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Sample vectors from which to compute variances.
y : any
Ignored. This parameter exists only for compatibility with
sklearn.pipeline.Pipeline.
Returns
-------
self
"""
if not isinstance(X, dict):
raise ValueError('X must be a dict')
self.variances_ = {k: np.var(x, axis=0) for k, x in X.items()}
# for i, var in enumerate(self.variances_):
# if np.all(var <= self.threshold):
# msg = "No feature in roi %g meets the variance threshold {0:.5f}" % i
# if X[i].shape[0] == 1:
# msg += " (roi contains only one sample)"
# raise ValueError(msg.format(self.threshold))
return self
def transform(self, X):
"""Reduce X to the selected features.
Parameters
----------
X : array of shape [n_samples, n_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
masks = self.get_support()
X_r = dict()
for roi_id in self.variances_.keys():
mask = masks[roi_id]
if len(mask) != X[roi_id].shape[1]:
raise ValueError("Roi %g has a different shape than during fitting." % roi_id)
X_r[roi_id] = X[roi_id][:, safe_mask(X[roi_id], mask)]
return X_r
def _get_support_mask(self):
support_masks = {k: v > np.percentile(v, self.threshold) for k, v in self.variances_.items()}
return support_masks
from sklearn.feature_selection.univariate_selection import _BaseFilter, f_classif, _clean_nans
from scipy import stats
class MuliRoiSelectPercentile(_BaseFilter):
"""Select features according to a percentile of the highest scores.
Read more in the :ref:`User Guide <univariate_feature_selection>`.
Parameters
----------
score_func : callable
Function taking two arrays X and y, and returning a pair of arrays
(scores, pvalues).
percentile : int, optional, default=10
Percent of features to keep.
Attributes
----------
scores_ : array-like, shape=(n_features,)
Scores of features.
pvalues_ : array-like, shape=(n_features,)
p-values of feature scores.
Notes
-----
Ties between features with equal scores will be broken in an unspecified
way.
See also
--------
f_classif: ANOVA F-value between labe/feature for classification tasks.
chi2: Chi-squared stats of non-negative features for classification tasks.
f_regression: F-value between label/feature for regression tasks.
SelectKBest: Select features based on the k highest scores.
SelectFpr: Select features based on a false positive rate test.
SelectFdr: Select features based on an estimated false discovery rate.
SelectFwe: Select features based on family-wise error rate.
GenericUnivariateSelect: Univariate feature selector with configurable mode.
"""
def __init__(self, score_func=f_classif, percentile=10):
super(MuliRoiSelectPercentile, self).__init__(score_func)
self.percentile = percentile
def fit(self, X, y):
"""Run score function on (X, y) and get the appropriate features.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples.
y : array-like, shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
Returns
-------
self : object
Returns self.
"""
if not callable(self.score_func):
raise TypeError("The score function should be a callable, %s (%s) "
"was passed."
% (self.score_func, type(self.score_func)))
self.scores_ = dict()
self.pvalues_ = dict()
for k, x in X.items():
score_, pvalue_ = self.score_func(x, y)
self.scores_[k] = np.asarray(score_)
self.pvalues_[k] = np.asarray(pvalue_)
return self
def transform(self, X):
"""Reduce X to the selected features.
Parameters
----------
X : array of shape [n_samples, n_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
masks = self.get_support()
X_r = dict()
for roi_id, mask in masks.items():
if len(mask) != X[roi_id].shape[1]:
raise ValueError("Roi %g has a different shape than during fitting." % roi_id)
if not mask.any():
warn("No features were selected in roi %g: either the data is"
" too noisy or the selection test too strict." % roi_id,
UserWarning)
X_r[roi_id] = np.empty(0).reshape((X[roi_id].shape[0], 0))
else:
X_r[roi_id] = X[roi_id][:, safe_mask(X[roi_id], mask)]
return X_r
def _check_params(self, X, y):
if not 0 <= self.percentile <= 100:
raise ValueError("percentile should be >=0, <=100; got %r"
% self.percentile)
def _get_support_mask(self):
check_is_fitted(self, 'scores_')
# Cater for NaNs
if self.percentile == 100:
return {k: np.ones(len(score), dtype=np.bool) for k, score in self.scores_.items()}
elif self.percentile == 0:
return {k: np.zeros(len(score), dtype=np.bool) for k, score in self.scores_.items()}
masks = dict()
for roi_id, score in self.scores_.items():
scores = _clean_nans(score)
if len(scores) == 1:
mask = np.array([True])
else:
treshold = stats.scoreatpercentile(scores,
100 - self.percentile)
mask = scores > treshold
ties = np.where(scores == treshold)[0]
if len(ties):
max_feats = len(scores) * self.percentile // 100
kept_ties = ties[:max_feats - mask.sum()]
mask[kept_ties] = True
masks[roi_id] = mask
return masks
from sklearn.utils.validation import NotFittedError, check_is_fitted
from sklearn.feature_selection.from_model import _calculate_threshold, _get_feature_importances
from sklearn.base import clone
class MultiRoiSelectFromModel(BaseEstimator, SelectorMixin):
"""Meta-transformer for selecting features based on importance weights.
Parameters
----------
estimator : object
The base estimator from which the transformer is built.
This can be both a fitted (if ``prefit`` is set to True)
or a non-fitted estimator.
threshold : string, float, optional default None
The threshold value to use for feature selection. Features whose
importance is greater or equal are kept while the others are
discarded. If "median" (resp. "mean"), then the ``threshold`` value is
the median (resp. the mean) of the feature importances. A scaling
factor (e.g., "1.25*mean") may also be used. If None and if the
estimator has a parameter penalty set to l1, either explicitly
or implicity (e.g, Lasso), the threshold is used is 1e-5.
Otherwise, "mean" is used by default.
prefit : bool, default False
Whether a prefit model is expected to be passed into the constructor
directly or not. If True, ``transform`` must be called directly
and SelectFromModel cannot be used with ``cross_val_score``,
``GridSearchCV`` and similar utilities that clone the estimator.
Otherwise train the model using ``fit`` and then ``transform`` to do
feature selection.
Attributes
----------
`estimator_`: an estimator
The base estimator from which the transformer is built.
This is stored only when a non-fitted estimator is passed to the
``SelectFromModel``, i.e when prefit is False.
`threshold_`: float
The threshold value used for feature selection.
"""
def __init__(self, estimator, threshold=None, allow_empty_roi=True):
self.estimator = estimator
self.threshold = threshold
self.allow_empty_roi = allow_empty_roi
def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
Returns self.
"""
if not isinstance(X, dict):
raise ValueError('X must be a dict')
if isinstance(self.estimator, RoiEnsemble):
self.estimator_ = self.estimator
self.estimator_.fit(X, y, **fit_params)
else:
self.estimator_ = dict()
for roi_id, x in X.items():
self.estimator_[roi_id] = clone(self.estimator)
self.estimator_[roi_id].fit(x, y, **fit_params)
self.masks = self.get_support()
roi_no_sample = [k for k, mask in self.masks.items() if not np.any(mask)]
self.roi_id_valid = np.setdiff1d(list(self.masks.keys()), roi_no_sample, assume_unique=True)
if len(roi_no_sample) == len(self.masks):
raise ValueError("No feature in any roi meets the required threshold")
elif np.any(roi_no_sample):
message = "No feature in roi(s) %s meets the threshold(s) %s" % \
(roi_no_sample, [self.threshold_[k] for k in roi_no_sample])
if self.allow_empty_roi:
warn(message, UserWarning)
else:
raise ValueError(message)
return self
def partial_fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer only once.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
Returns self.
"""
raise RuntimeError('Partial Fit not supported')
# if not isinstance(X, list):
# raise ValueError('X must be a list')
#
# self.estimator_ = dict()
# for roi_id, x in X.items():
# self.estimator_[roi_id] = clone(self.estimator)
# self.estimator_[roi_id].partial_fit(x, y, **fit_params)
#
# return self
def transform(self, X):
"""Reduce X to the selected features.
Parameters
----------
X : array of shape [n_samples, n_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
X_r = dict()
for roi_id in self.roi_id_valid:
if len(self.masks[roi_id]) != X[roi_id].shape[1]:
raise ValueError("Roi %g has a different shape than during fitting." % roi_id)
if not self.masks[roi_id].any():
warn("No features were selected in roi %g: either the data is"
" too noisy or the selection test too strict." % roi_id,
UserWarning)
X_r[roi_id] = np.empty(0).reshape((X[roi_id].shape[0], 0))
else:
X_r[roi_id] = X[roi_id][:, safe_mask(X[roi_id], self.masks[roi_id])]
return X_r
def _get_support_mask(self):
if hasattr(self, 'estimator_'):
if isinstance(self.estimator_, dict):
estimators = self.estimator_
else:
estimators = self.estimator_.estimators_
else:
raise NotFittedError('Fit the model before transform')
self.threshold_, masks = dict(), dict()
for roi_id, estimator in estimators.items():
score_ = _get_feature_importances(estimator)
self.threshold_[roi_id] = _calculate_threshold(estimator, score_, self.threshold)
masks[roi_id] = np.atleast_1d(score_ >= self.threshold_[roi_id])
return masks
class SelectRoisFromModel(BaseEstimator, SelectorMixin):
"""Meta-transformer for selecting features based on importance weights.
Parameters
----------
estimator : object
The base estimator from which the transformer is built.
This can be both a fitted (if ``prefit`` is set to True)
or a non-fitted estimator.
threshold : string, float, optional default None
The threshold value to use for feature selection. Features whose
importance is greater or equal are kept while the others are
discarded. If "median" (resp. "mean"), then the ``threshold`` value is
the median (resp. the mean) of the feature importances. A scaling
factor (e.g., "1.25*mean") may also be used. If None and if the
estimator has a parameter penalty set to l1, either explicitly
or implicity (e.g, Lasso), the threshold is used is 1e-5.
Otherwise, "mean" is used by default.
prefit : bool, default False
Whether a prefit model is expected to be passed into the constructor
directly or not. If True, ``transform`` must be called directly
and SelectFromModel cannot be used with ``cross_val_score``,
``GridSearchCV`` and similar utilities that clone the estimator.
Otherwise train the model using ``fit`` and then ``transform`` to do
feature selection.
Attributes
----------
`estimator_`: an estimator
The base estimator from which the transformer is built.
This is stored only when a non-fitted estimator is passed to the
``SelectFromModel``, i.e when prefit is False.
`threshold_`: float
The threshold value used for feature selection.
"""
def __init__(self, estimator, criterion=None):
self.estimator = estimator
self.criterion = criterion
def fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
Returns self.
"""
if not isinstance(X, dict):
raise ValueError('X must be a dict')
if isinstance(self.estimator, RoiEnsemble):
self.estimator_ = self.estimator
self.estimator_.fit(X, y, **fit_params)
else:
self.estimator_ = dict()
for roi_id, x in X.items():
self.estimator_[roi_id] = clone(self.estimator)
self.estimator_[roi_id].fit(x, y, **fit_params)
self.masks = self.get_support()
return self
def partial_fit(self, X, y=None, **fit_params):
"""Fit the SelectFromModel meta-transformer only once.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
**fit_params : Other estimator specific parameters
Returns
-------
self : object
Returns self.
"""
raise RuntimeError('Partial Fit not supported')
# if not isinstance(X, list):
# raise ValueError('X must be a list')
#
# self.estimator_ = dict()
# for roi_id, x in X.items():
# self.estimator_[roi_id] = clone(self.estimator)
# self.estimator_[roi_id].partial_fit(x, y, **fit_params)
#
# return self
def transform(self, X):
"""Reduce X to the selected features.
Parameters
----------
X : array of shape [n_samples, n_features]
The input samples.
Returns
-------
X_r : array of shape [n_samples, n_selected_features]
The input samples with only the selected features.
"""
return {k: X[k] for k in self.masks}
def _get_support_mask(self):
if hasattr(self, 'estimator_'):
if isinstance(self.estimator_, dict):
estimators = self.estimator_
else:
estimators = self.estimator_.estimators_
else:
raise NotFittedError('Fit the model before transform')
# if len(estimators) is already 1, no further feature selection reasonable
if self.criterion is None or len(estimators) == 1:
if len(estimators) == 1:
warn('Skipping ROI feature selection, because otherwise no ROI would be left.')
return list(estimators.keys())
else:
scores = dict()
for roi_id, estimator in estimators.items():
scores[roi_id] = np.mean(_get_feature_importances(estimator))
scores_sorted = sorted(scores.items(), key=lambda x: x[1], reverse=True)
if self.criterion < 1: # proportion
return [x[0] for x in scores_sorted[:max(1, round(self.criterion * len(scores)))]]
else:
return [x[0] for x in scores_sorted[:self.criterion]]
# def _get_support_mask(self):
# if not hasattr(self, 'estimator_'):
# raise NotFittedError('Fit the model before transform')
#
# if isinstance(self.estimator_, dict):
# if self.criterion is None:
# return list(self.estimator_.keys())
# else:
# scores = dict()
# for roi_id, estimator in self.estimator_.items():
# scores[roi_id] = np.mean(_get_feature_importances(estimator))
#
# scores_sorted = sorted(scores.items(), key=lambda x: x[1], reverse=True)
#
# if self.criterion < 1: # proportion
# return [x[0] for x in scores_sorted[:max(1, round(self.criterion * len(scores)))]]
# else:
# return [x[0] for x in scores_sorted[:self.criterion]]
# else:
# if self.criterion is None:
# return list(self.estimator_.keys())
# else:
class VariancePercentile(BaseEstimator, SelectorMixin):
"""Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the
desired outputs (y), and can thus be used for unsupervised learning.
Read more in the :ref:`User Guide <variance_threshold>`.
Parameters
----------
threshold : float, optional
Features with a training-set variance lower than this threshold will
be removed. The default is to keep all features with non-zero variance,
i.e. remove the features that have the same value in all samples.
Attributes
----------
variances_ : array, shape (n_features,)
Variances of individual features.
Examples
--------
The following dataset has integer features, two of which are the same
in every sample. These are removed with the default setting for threshold::
>>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
>>> selector = VarianceThreshold()
>>> selector.fit_transform(X)
array([[2, 0],
[1, 4],
[1, 1]])
"""
def __init__(self, threshold=0.):
self.threshold = threshold
def fit(self, X, y=None):
"""Learn empirical variances from X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Sample vectors from which to compute variances.
y : any
Ignored. This parameter exists only for compatibility with
sklearn.pipeline.Pipeline.
Returns
-------
self
"""
X = check_array(X, ('csr', 'csc'), dtype=np.float64)
if hasattr(X, "toarray"): # sparse matrix
_, self.variances_ = mean_variance_axis(X, axis=0)
else:
self.variances_ = np.var(X, axis=0)
return self
def _get_support_mask(self):
check_is_fitted(self, 'variances_')
support_mask = self.variances_ > np.percentile(self.variances_, self.threshold)
return support_mask