def _get_support_mask(self):
    if hasattr(self, 'estimator_'):
        if isinstance(self.estimator_, dict):
            estimators = self.estimator_
        else:
            estimators = self.estimator_.estimators_
    else:
        raise NotFittedError('Fit the model before transform')
    # If only one estimator is left, no further feature selection is reasonable.
    if self.criterion is None or len(estimators) == 1:
        if len(estimators) == 1:
            warn('Skipping ROI feature selection, because otherwise no ROI '
                 'would be left.')
        return list(estimators.keys())
    else:
        scores = dict()
        for roi_id, estimator in estimators.items():
            scores[roi_id] = np.mean(_get_feature_importances(estimator))
        scores_sorted = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        if self.criterion < 1:  # proportion of ROIs to keep
            return [x[0] for x in
                    scores_sorted[:max(1, round(self.criterion * len(scores)))]]
        else:  # absolute number of ROIs to keep
            return [x[0] for x in scores_sorted[:self.criterion]]
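# A short sketch of the criterion semantics in the ROI selector above, under
# the assumption (taken from the code itself) that a criterion < 1 is read as
# a proportion of ROIs to keep and an integer >= 1 as an absolute count.
scores_sorted = [('roi_b', 0.9), ('roi_a', 0.5), ('roi_c', 0.1)]
criterion = 0.5
n_keep = max(1, round(criterion * len(scores_sorted)))  # 2 of 3 ROIs
print([roi for roi, _ in scores_sorted[:n_keep]])  # ['roi_b', 'roi_a']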
def _get_support_mask(self):
    if self.prefit:
        estimator = self.estimator
    elif hasattr(self, 'estimator_'):
        estimator = self.estimator_
    else:
        raise ValueError(
            'Either fit SelectFromModel before transform or set "prefit='
            'True" and pass a fitted estimator to the constructor.')
    scores = _get_feature_importances(estimator)
    # Keep the n_selected highest-scoring features; ties at the cutoff are
    # all kept, so the mask can select more than n_selected features.
    threshold = np.sort(scores)[-self.n_selected]
    return scores >= threshold
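# Minimal illustration of the top-n cutoff used above, showing the tie
# behaviour noted in the comment.
import numpy as np

scores = np.array([0.1, 0.4, 0.4, 0.05])
n_selected = 2
threshold = np.sort(scores)[-n_selected]  # 0.4
print(scores >= threshold)  # [False  True  True False]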
def _get_support_mask(self):
    if hasattr(self, 'estimator_'):
        if isinstance(self.estimator_, dict):
            estimators = self.estimator_
        else:
            estimators = self.estimator_.estimators_
    else:
        raise NotFittedError('Fit the model before transform')
    self.threshold_, masks = dict(), dict()
    for roi_id, estimator in estimators.items():
        score_ = _get_feature_importances(estimator)
        self.threshold_[roi_id] = _calculate_threshold(estimator, score_,
                                                       self.threshold)
        masks[roi_id] = np.atleast_1d(score_ >= self.threshold_[roi_id])
    return masks
def my_feature_importance(my_pipeline, accuracy_scorer, X, y):
    try:
        return _get_feature_importances(my_pipeline.named_steps['clf'])
    except Exception:
        # Fall back to permutation importance when the estimator exposes
        # neither coef_ nor feature_importances_.
        def score(X, y):
            return accuracy_scorer(my_pipeline, X, y)

        base_score, score_decreases = get_score_importances(score, X, y,
                                                            n_iter=5)
        return np.mean(score_decreases, axis=0)
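# Hedged usage sketch of the permutation-importance fallback, assuming
# get_score_importances is eli5.permutation_importance.get_score_importances
# (the snippet above does not show its import).
import numpy as np
from eli5.permutation_importance import get_score_importances
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
clf = SVC().fit(X, y)  # exposes neither coef_ nor feature_importances_

def score(X, y):
    return clf.score(X, y)

base_score, score_decreases = get_score_importances(score, X, y, n_iter=5)
print(np.mean(score_decreases, axis=0))  # mean accuracy drop per feature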
def print_columns_to_drop(self):
    """Print information on whether and why a column was dropped."""
    for ci, cn in zip(self._indices_to_drop, self.columns_to_drop):
        selector_attributes = self.selector.__dict__
        if 'threshold' in selector_attributes:
            # VarianceThreshold
            if self.selector.__class__.__name__ == 'VarianceThreshold':
                print(("The variance of column '%s' (%0.4f) is "
                       "below the threshold of %0.4f")
                      % (cn, self.selector.variances_[ci],
                         self.selector.threshold))
            # SelectFromModel
            if self.selector.__class__.__name__ == 'SelectFromModel':
                # The fitted estimator ends with an underscore.
                estimator = self.selector.estimator_
                print(("The feature importance of column '%s' "
                       "(%0.4f) is below the threshold of %0.4f")
                      % (cn, _get_feature_importances(estimator)[ci],
                         self.selector.threshold))
        elif 'percentile' in selector_attributes:
            # SelectPercentile
            print(("The feature importance of column '%s' (%0.4f) is "
                   "out of the %d%% of features to keep")
                  % (cn, self.selector.scores_[ci], self.selector.percentile))
        elif 'alpha' in selector_attributes:
            # SelectFpr, SelectFdr, SelectFwe
            print(("The p-value of column '%s' (%0.4f) is above the "
                   "specified alpha of %0.4f")
                  % (cn, self.selector.pvalues_[ci], self.selector.alpha))
        elif 'k' in selector_attributes:
            # SelectKBest
            print(("The feature importance of column '%s' (%0.4f) is "
                   "too low to end up in the %d best features")
                  % (cn, self.selector.scores_[ci], self.selector.k))
        elif 'n_features_to_select' in selector_attributes:
            # RFE
            print(("The feature importance of column '%s' is "
                   "too low to end up in the %d best features")
                  % (cn, self.selector.n_features_to_select))
        elif 'min_features_to_select' in selector_attributes:
            # RFECV
            print(("The feature importance of column '%s' is "
                   "too low to end up in the %d best features")
                  % (cn, self.selector.min_features_to_select))
def _get_support_mask(self):
    # SelectFromModelPercentile can be called directly on transform.
    if self.prefit:
        estimator = self.estimator
    elif hasattr(self, 'estimator_'):
        estimator = self.estimator_
    else:
        raise ValueError(
            'Either fit SelectFromModelPercentile before transform or set '
            '"prefit=True" and pass a fitted estimator to the constructor.')
    scores = _get_feature_importances(estimator)
    # Keep the top `percentile` fraction of features by importance. Selecting
    # from the descending argsort avoids the negative-slice pitfall of
    # scores.argsort()[-n:], which keeps everything when n == 0.
    n_keep = int(len(scores) * self.percentile)
    mask = np.zeros(len(scores), dtype=bool)
    mask[np.argsort(scores)[::-1][:n_keep]] = True
    return mask
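# Edge-case note for the percentile cutoff above: int(len(scores) * percentile)
# truncates, so small percentiles on few features can keep zero features.
import numpy as np

scores = np.array([0.3, 0.1, 0.6, 0.2])
n_keep = int(len(scores) * 0.5)  # 2
mask = np.zeros(len(scores), dtype=bool)
mask[np.argsort(scores)[::-1][:n_keep]] = True
print(mask)  # [ True False  True False]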
def calculate_features_importance(self):
    x_train, y_train, x_test, y_test = self.create_train_test_data()
    Verbose.instance.print(
        1, f'Calculating importance for {len(self._features)} features')
    self._estimator.fit(x_train, y_train)
    ranks = np.nan_to_num(_get_feature_importances(self._estimator))
    # Zero out importances below the threshold.
    self._ranks = [0. if r < self._threshold else r for r in ranks]
    self.logger.log({
        'date': datetime.datetime.now(),
        'name': self.name,
        'estimator': self._estimator.__class__.__name__,
        'estimator_args': json.dumps(self._estimator_args),
        'all_features': json.dumps(self._features),
        'ranking': json.dumps(self.ranks)
    })
def _get_support_mask(self):
    if self.threshold is not None:
        return super()._get_support_mask()
    elif self.n_features is not None:
        if self.prefit:
            estimator = self.estimator
        elif hasattr(self, 'estimator_'):
            estimator = self.estimator_
        else:
            raise ValueError(
                'Either fit SelectFromModel before transform or set "prefit='
                'True" and pass a fitted estimator to the constructor.')
        score = _get_feature_importances(estimator, self.norm_order)
        n_largest = np.argsort(score)[-self.n_features:]
        mask = np.zeros(len(score), dtype=bool)
        mask[n_largest] = True
        return mask
    else:
        raise ValueError('Got threshold={} and n_features={}. '
                         'One of them needs to be specified.'.format(
                             self.threshold, self.n_features))
def model_score(X, y=None, estimator=None):
    estimator.fit(X, y)
    return _get_feature_importances(estimator)
def _get_support_mask(self):
    check_is_fitted(self, "n_iter_")
    scores = _get_feature_importances(self)
    self.threshold_ = _calculate_threshold(self, scores, self.threshold)
    return scores >= self.threshold_
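# Illustration of the string thresholds that scikit-learn's private
# _calculate_threshold helper resolves (e.g. 'mean', 'median', '1.5*mean'),
# shown here with plain numpy equivalents rather than the helper itself.
import numpy as np

scores = np.array([0.1, 0.2, 0.3, 0.4])
print(scores >= np.mean(scores))    # threshold='mean'   -> [False False  True  True]
print(scores >= np.median(scores))  # threshold='median' -> [False False  True  True]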
def threshold_(self):
    scores = _get_feature_importances(self.estimator_)
    return np.sort(scores)[-self.n_selected]
def scores_(self):
    return _get_feature_importances(self.estimator_)
def recursive_feature_elimination(X_train, X_validation, X_test, y_train,
                                  y_validation, y_test, names, sensitive_ids,
                                  ranking_functions=[], clf=None,
                                  min_accuracy=0.0, min_fairness=0.0,
                                  min_robustness=0.0, max_number_features=None,
                                  max_search_time=np.inf, log_file=None):
    f_log = open(log_file, 'wb+')
    min_loss = np.inf
    start_time = time.time()
    if max_number_features is None:
        # No limit: allow the full feature set (cv_number_features is a
        # fraction, so comparing against None would raise a TypeError).
        max_number_features = 1.0

    auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
                             needs_threshold=True)

    fair_validation = None
    fair_test = None
    if sensitive_ids is not None:
        fair_validation = make_scorer(
            true_positive_rate_score, greater_is_better=True,
            sensitive_data=X_validation[:, sensitive_ids[0]])
        fair_test = make_scorer(true_positive_rate_score,
                                greater_is_better=True,
                                sensitive_data=X_test[:, sensitive_ids[0]])

    def f_clf1(hps):
        mask = np.zeros(len(hps), dtype=bool)
        for k, v in hps.items():
            mask[int(k.split('_')[1])] = v
        for mask_i in range(len(mask)):
            hps['f_' + str(mask_i)] = mask[mask_i]
        model = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])
        return model, hps

    def f_to_min1(hps):
        pipeline, hps = f_clf1(hps)
        if np.sum(pipeline.named_steps['selection'].mask) == 0:
            return {'loss': 4, 'model': pipeline, 'cv_fair': 0.0,
                    'cv_acc': 0.0, 'cv_robust': 0.0, 'cv_number_features': 1.0}
        pipeline.fit(X_train, pd.DataFrame(y_train))

        validation_number_features = float(
            np.sum(pipeline.named_steps['selection']._get_support_mask())
        ) / float(X_train.shape[1])
        validation_acc = auc_scorer(pipeline, X_validation,
                                    pd.DataFrame(y_validation))

        validation_fair = 0.0
        if sensitive_ids is not None and min_fairness > 0.0:
            validation_fair = 1.0 - fair_validation(pipeline, X_validation,
                                                    pd.DataFrame(y_validation))
        validation_robust = 0.0
        if min_robustness > 0.0:
            validation_robust = 1.0 - robust_score_test(
                eps=0.1, X_test=X_validation, y_test=y_validation,
                model=pipeline.named_steps['clf'],
                feature_selector=pipeline.named_steps['selection'],
                scorer=auc_scorer)

        # Squared penalty for each violated constraint.
        loss = 0.0
        if min_fairness > 0.0 and validation_fair < min_fairness:
            loss += (min_fairness - validation_fair) ** 2
        if min_accuracy > 0.0 and validation_acc < min_accuracy:
            loss += (min_accuracy - validation_acc) ** 2
        if min_robustness > 0.0 and validation_robust < min_robustness:
            loss += (min_robustness - validation_robust) ** 2

        current_time = time.time() - start_time
        return {'loss': loss, 'model': pipeline, 'cv_fair': validation_fair,
                'cv_acc': validation_acc, 'cv_robust': validation_robust,
                'cv_number_features': validation_number_features,
                'time': current_time, 'updated_parameters': hps}

    def execute_feature_combo(feature_combo, number_of_evaluations):
        hps = {}
        for f_i in range(X_train.shape[1]):
            hps['f_' + str(f_i)] = 1 if f_i in feature_combo else 0

        result = f_to_min1(hps)
        cv_fair = result['cv_fair']
        cv_acc = result['cv_acc']
        cv_robust = result['cv_robust']
        cv_number_features = result['cv_number_features']

        my_result = result
        my_result['number_evaluations'] = number_of_evaluations

        if (cv_fair >= min_fairness and cv_acc >= min_accuracy
                and cv_robust >= min_robustness
                and cv_number_features <= max_number_features):
            # All validation constraints hold: refit on train + validation
            # and check the constraints on the test set.
            model = result['model']
            X_train_val = np.vstack((X_train, X_validation))
            y_train_val = np.append(y_train, y_validation)
            model.fit(X_train_val, pd.DataFrame(y_train_val))

            test_acc = 0.0
            if min_accuracy > 0.0:
                test_acc = auc_scorer(model, X_test, pd.DataFrame(y_test))
            test_fair = 0.0
            if min_fairness > 0.0:
                test_fair = 1.0 - fair_test(model, X_test, pd.DataFrame(y_test))
            test_robust = 0.0
            if min_robustness > 0.0:
                test_robust = 1.0 - robust_score_test(
                    eps=0.1, X_test=X_test, y_test=y_test,
                    model=model.named_steps['clf'],
                    feature_selector=model.named_steps['selection'],
                    scorer=auc_scorer)

            my_result['test_fair'] = test_fair
            my_result['test_acc'] = test_acc
            my_result['test_robust'] = test_robust
            my_result['final_time'] = time.time() - start_time
            my_result['Finished'] = True

            success = (test_fair >= min_fairness and test_acc >= min_accuracy
                       and test_robust >= min_robustness)
            my_result['success_test'] = success
            pickle.dump(my_result, f_log)
            return result, {'success': success}

        return result, {}

    number_of_evaluations = 0
    current_feature_set = list(range(X_train.shape[1]))
    while len(current_feature_set) >= 1:
        # Evaluate the current feature set.
        number_of_evaluations += 1
        feature_combo = copy.deepcopy(current_feature_set)
        my_result, combo_result = execute_feature_combo(feature_combo,
                                                        number_of_evaluations)
        if min_loss > my_result['loss']:
            min_loss = my_result['loss']
            pickle.dump(my_result, f_log)
        print('loss: ' + str(my_result['loss']))
        if len(combo_result) > 0:
            return combo_result
        # Drop the least important feature and continue.
        worst_id = np.argmin(_get_feature_importances(
            my_result['model'].named_steps['clf']))
        current_feature_set.remove(current_feature_set[worst_id])

    my_result = {'number_evaluations': number_of_evaluations,
                 'success_test': False, 'time': time.time() - start_time,
                 'Finished': True}
    pickle.dump(my_result, f_log)
    f_log.close()
    return {'success': False}
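# A minimal, self-contained sketch of the backward-elimination loop above:
# fit, drop the least important feature, repeat. A tree model's public
# feature_importances_ stands in for the private _get_feature_importances.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=6, random_state=0)
remaining = list(range(X.shape[1]))
while len(remaining) > 1:
    clf = DecisionTreeClassifier(random_state=0).fit(X[:, remaining], y)
    worst = int(np.argmin(clf.feature_importances_))
    remaining.pop(worst)
print(remaining)  # index of the last surviving feature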