Example #1
    def _get_support_mask(self):
        if hasattr(self, 'estimator_'):
            if isinstance(self.estimator_, dict):
                estimators = self.estimator_
            else:
                estimators = self.estimator_.estimators_
        else:
            raise NotFittedError('Fit the model before transform')

        # if len(estimators) is already 1, no further feature selection is reasonable
        if self.criterion is None or len(estimators) == 1:
            if len(estimators) == 1:
                warn('Skipping ROI feature selection, because otherwise no ROI would be left.')
            return list(estimators.keys())
        else:
            scores = dict()
            for roi_id, estimator in estimators.items():
                scores[roi_id] = np.mean(_get_feature_importances(estimator))

            scores_sorted = sorted(scores.items(), key=lambda x: x[1], reverse=True)

            if self.criterion < 1:  # proportion
                return [x[0] for x in scores_sorted[:max(1, round(self.criterion * len(scores)))]]
            else:
                return [x[0] for x in scores_sorted[:self.criterion]]
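Every snippet on this page calls scikit-learn's private helper _get_feature_importances. As rough orientation, the sketch below mirrors what that helper did in older scikit-learn releases (read feature_importances_ if present, otherwise aggregate coef_); the exact signature and import path vary between versions, so treat it as an approximation rather than the library's current API.

import numpy as np

def _get_feature_importances(estimator, norm_order=1):
    # Return one importance value per feature from a fitted estimator.
    importances = getattr(estimator, "feature_importances_", None)
    coef_ = getattr(estimator, "coef_", None)
    if importances is None and coef_ is not None:
        if coef_.ndim == 1:
            importances = np.abs(coef_)
        else:
            # Collapse multi-class/multi-output coefficients to one value per feature.
            importances = np.linalg.norm(coef_, axis=0, ord=norm_order)
    elif importances is None:
        raise ValueError(
            "The underlying estimator has no coef_ or feature_importances_ attribute.")
    return importances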
Example #2
 def _get_support_mask(self):
     if self.prefit:
         estimator = self.estimator
     elif hasattr(self, 'estimator_'):
         estimator = self.estimator_
     else:
         raise ValueError(
             'Either fit SelectFromModel before transform or set "prefit='
             'True" and pass a fitted estimator to the constructor.')
     scores = _get_feature_importances(estimator)
     threshold = np.sort(scores)[-self.n_selected]
     return scores >= threshold
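One caveat with the comparison above: scores >= threshold keeps every feature whose importance ties with the cutoff value, so more than n_selected columns can survive. Where an exact count matters, an index-based mask avoids that; the helper below is illustrative only and assumes n_selected >= 1.

import numpy as np

def top_n_mask(scores, n_selected):
    # Keep exactly the n_selected highest-scoring features, even when
    # several scores tie at the cutoff value.
    mask = np.zeros(len(scores), dtype=bool)
    mask[np.argsort(scores)[-n_selected:]] = True
    return mask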
Example #3
    def _get_support_mask(self):
        if hasattr(self, 'estimator_'):
            if isinstance(self.estimator_, dict):
                estimators = self.estimator_
            else:
                estimators = self.estimator_.estimators_
        else:
            raise NotFittedError('Fit the model before transform')

        self.threshold_, masks = dict(), dict()
        for roi_id, estimator in estimators.items():
            score_ = _get_feature_importances(estimator)
            self.threshold_[roi_id] = _calculate_threshold(estimator, score_, self.threshold)
            masks[roi_id] = np.atleast_1d(score_ >= self.threshold_[roi_id])
        return masks
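This variant resolves a per-ROI cutoff with scikit-learn's private _calculate_threshold, which turns a threshold specification into a number: "mean" or "median" (optionally scaled, e.g. "1.25*mean") are computed from the importances, a float is used as-is, and None falls back to an estimator-dependent default. The sketch below approximates that behaviour with a plain "mean" default; it is not the library's exact code.

import numpy as np

def calculate_threshold_sketch(importances, threshold):
    # Approximation of the private helper; the real one also inspects the
    # estimator (e.g. L1-penalised models) when threshold is None.
    if threshold is None:
        threshold = "mean"
    if isinstance(threshold, str):
        if "*" in threshold:
            scale, reference = threshold.split("*")
            scale, reference = float(scale.strip()), reference.strip()
        else:
            scale, reference = 1.0, threshold
        if reference == "mean":
            return scale * np.mean(importances)
        if reference == "median":
            return scale * np.median(importances)
        raise ValueError("Expected 'mean' or 'median', got %r" % reference)
    return float(threshold)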
Example #4
def my_feature_importance(my_pipeline, accuracy_scorer, X, y):

    try:
        return _get_feature_importances(my_pipeline.named_steps['clf'])
    except Exception:
        # Fall back to permutation importance via eli5's get_score_importances.
        def score(X, y):
            return accuracy_scorer(my_pipeline, X, y)

        base_score, score_decreases = get_score_importances(score,
                                                            X,
                                                            y,
                                                            n_iter=5)
        feature_importances = np.mean(score_decreases, axis=0)

        return feature_importances
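A minimal, illustrative call: the data, pipeline and scorer below are placeholders, and get_score_importances in the fallback branch comes from eli5.permutation_importance.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
pipe = Pipeline([("scale", StandardScaler()),
                 ("clf", LogisticRegression())]).fit(X, y)

# LogisticRegression exposes coef_, so the fast path is taken; estimators
# without coefficients would fall back to permutation importance instead.
importances = my_feature_importance(pipe, make_scorer(accuracy_score), X, y)
print(importances.shape)  # (10,) -- one value per input feature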
Example #5
 def print_columns_to_drop(self):
     """Print information if and why a column was dropped."""
     for ci, cn in zip(self._indices_to_drop, self.columns_to_drop):
         selector_attributes = self.selector.__dict__
         if 'threshold' in selector_attributes:
             # VarianceThreshold
             if self.selector.__class__.__name__ == 'VarianceThreshold':
                 print(("The variance of column '%s' (%0.4f) is " +
                        "below the threshold of %0.4f")
                       % (cn, self.selector.variances_[ci],
                          self.selector.threshold))
             # SelectFromModel
             if self.selector.__class__.__name__ == 'SelectFromModel':
                 # The fitted estimator ends with an underscore
                 estimator = self.selector.estimator_
                 print(("The feature importance of column '%s' " +
                        "(%0.4f) is below the threshold of %0.4f")
                       % (cn, _get_feature_importances(estimator)[ci],
                          self.selector.threshold_))
         elif 'percentile' in selector_attributes:
             # SelectPercentile
             print(("The feature importance of column '%s' (%0.4f) is " +
                    "out of the %d%% of features to keep")
                   % (cn, self.selector.scores_[ci],
                      self.selector.percentile))
         elif 'alpha' in selector_attributes:
             # SelectFpr, SelectFdr, SelectFwe
             print(("The p-value of column '%s' (%0.4f) is above the " +
                    "specified alpha of %0.4f")
                   % (cn, self.selector.pvalues_[ci], self.selector.alpha))
         elif 'k' in selector_attributes:
             # SelectKBest
             print(("The feature importance of column '%s' (%0.4f) is " +
                    "too low to end up in the %d best features")
                   % (cn, self.selector.scores_[ci], self.selector.k))
         elif 'n_features_to_select' in selector_attributes:
             # RFE
             print(("The feature importance of column '%s' is " +
                    "too low to end up in the %d best features")
                   % (cn, self.selector.n_features_to_select))
         elif 'min_features_to_select' in selector_attributes:
             # RFECV
             print(("The feature importance of column '%s' is " +
                    "too low to end up in the %d best features")
                   % (cn, self.selector.min_features_to_select))
Example #6
    def _get_support_mask(self):
        # With prefit=True, SelectFromModelPercentile's transform can be called directly.
        if self.prefit:
            estimator = self.estimator
        elif hasattr(self, 'estimator_'):
            estimator = self.estimator_
        else:
            raise ValueError(
                'Either fit SelectFromModelPercentile before transform or set "prefit='
                'True" and pass a fitted estimator to the constructor.')
        scores = _get_feature_importances(estimator)
        # self.percentile is treated here as a fraction in [0, 1], not a 0-100 percentage.
        n_keep = int(len(scores) * self.percentile)
        top_idx = scores.argsort()[-n_keep:]

        # Boolean support mask: True for the n_keep highest-scoring features.
        mask = np.zeros(len(scores), dtype=bool)
        mask[top_idx] = True

        return mask
Example #7
    def calculate_features_importance(self):
        x_train, y_train, x_test, y_test = self.create_train_test_data()

        Verbose.instance.print(
            1, f'Calculating importance for {len(self._features)} features')

        self._estimator.fit(x_train, y_train)
        ranks = np.nan_to_num(_get_feature_importances(self._estimator))

        self._ranks = list(
            map(lambda x: 0. if x < self._threshold else x, ranks))

        self.logger.log({
            'date': datetime.datetime.now(),
            'name': self.name,
            'estimator': self._estimator.__class__.__name__,
            'estimator_args': json.dumps(self._estimator_args),
            'all_features': json.dumps(self._features),
            'ranking': json.dumps(self.ranks)
        })
Example #8
    def _get_support_mask(self):
        if self.threshold is not None:
            return super()._get_support_mask()
        elif self.n_features is not None:
            if self.prefit:
                estimator = self.estimator
            elif hasattr(self, 'estimator_'):
                estimator = self.estimator_
            else:
                raise ValueError(
                    'Either fit SelectFromModel before transform or set "prefit='
                    'True" and pass a fitted estimator to the constructor.')

            score = _get_feature_importances(estimator, self.norm_order)
            n_largest = np.argsort(score)[-self.n_features:]
            mask = np.zeros(len(score), dtype=bool)
            mask[n_largest] = True
            return mask

        else:
            raise ValueError('Got threshold={} and n_features={}. '
                             'Either of them needs to be specified.'.format(
                                 self.threshold, self.n_features))
Example #9
def model_score(X, y=None, estimator=None):
    estimator.fit(X, y)
    scores = _get_feature_importances(estimator)
    return scores
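Illustrative usage, assuming _get_feature_importances is importable in the surrounding module; the dataset and estimator are arbitrary.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=150, n_features=6, random_state=0)
scores = model_score(X, y, estimator=RandomForestClassifier(random_state=0))
print(scores)  # six impurity-based importances, one per feature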
Example #10
 def _get_support_mask(self):
     check_is_fitted(self, "n_iter_")
     scores = _get_feature_importances(self)
     self.threshold_ = _calculate_threshold(self, scores, self.threshold)
     return scores >= self.threshold_
Example #11
 def threshold_(self):
     scores = _get_feature_importances(self.estimator_)
     return np.sort(scores)[-self.n_selected]
 def scores_(self):
     scores = _get_feature_importances(self.estimator_)
     return scores
Example #12
def recursive_feature_elimination(X_train,
                                  X_validation,
                                  X_test,
                                  y_train,
                                  y_validation,
                                  y_test,
                                  names,
                                  sensitive_ids,
                                  ranking_functions=[],
                                  clf=None,
                                  min_accuracy=0.0,
                                  min_fairness=0.0,
                                  min_robustness=0.0,
                                  max_number_features=None,
                                  max_search_time=np.inf,
                                  log_file=None):
    f_log = open(log_file, 'wb+')
    min_loss = np.inf
    start_time = time.time()

    auc_scorer = make_scorer(roc_auc_score,
                             greater_is_better=True,
                             needs_threshold=True)

    fair_validation = None
    fair_test = None
    if sensitive_ids is not None:
        fair_validation = make_scorer(
            true_positive_rate_score,
            greater_is_better=True,
            sensitive_data=X_validation[:, sensitive_ids[0]])
        fair_test = make_scorer(true_positive_rate_score,
                                greater_is_better=True,
                                sensitive_data=X_test[:, sensitive_ids[0]])

    def f_clf1(hps):
        mask = np.zeros(len(hps), dtype=bool)
        for k, v in hps.items():
            mask[int(k.split('_')[1])] = v

        for mask_i in range(len(mask)):
            hps['f_' + str(mask_i)] = mask[mask_i]

        model = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])

        return model, hps

    def f_to_min1(hps):
        pipeline, hps = f_clf1(hps)

        if np.sum(pipeline.named_steps['selection'].mask) == 0:
            return {
                'loss': 4,
                'model': pipeline,
                'cv_fair': 0.0,
                'cv_acc': 0.0,
                'cv_robust': 0.0,
                'cv_number_features': 1.0
            }

        pipeline.fit(X_train, pd.DataFrame(y_train))

        validation_number_features = float(
            np.sum(pipeline.named_steps['selection']._get_support_mask())
        ) / float(X_train.shape[1])
        validation_acc = auc_scorer(pipeline, X_validation,
                                    pd.DataFrame(y_validation))

        validation_fair = 0.0
        if sensitive_ids is not None and min_fairness > 0.0:
            validation_fair = 1.0 - fair_validation(pipeline, X_validation,
                                                    pd.DataFrame(y_validation))
        validation_robust = 0.0
        if min_robustness > 0.0:
            validation_robust = 1.0 - robust_score_test(
                eps=0.1,
                X_test=X_validation,
                y_test=y_validation,
                model=pipeline.named_steps['clf'],
                feature_selector=pipeline.named_steps['selection'],
                scorer=auc_scorer)

        loss = 0.0
        if min_fairness > 0.0 and validation_fair < min_fairness:
            loss += (min_fairness - validation_fair)**2
        if min_accuracy > 0.0 and validation_acc < min_accuracy:
            loss += (min_accuracy - validation_acc)**2
        if min_robustness > 0.0 and validation_robust < min_robustness:
            loss += (min_robustness - validation_robust)**2

        current_time = time.time() - start_time

        return {
            'loss': loss,
            'model': pipeline,
            'cv_fair': validation_fair,
            'cv_acc': validation_acc,
            'cv_robust': validation_robust,
            'cv_number_features': validation_number_features,
            'time': current_time,
            'updated_parameters': hps
        }

    def execute_feature_combo(feature_combo, number_of_evaluations):
        hps = {}
        for f_i in range(X_train.shape[1]):
            if f_i in feature_combo:
                hps['f_' + str(f_i)] = 1
            else:
                hps['f_' + str(f_i)] = 0

        result = f_to_min1(hps)

        cv_fair = result['cv_fair']
        cv_acc = result['cv_acc']
        cv_robust = result['cv_robust']
        cv_number_features = result['cv_number_features']

        my_result = result
        my_result['number_evaluations'] = number_of_evaluations
        if cv_fair >= min_fairness and cv_acc >= min_accuracy and cv_robust >= min_robustness and cv_number_features <= max_number_features:
            model = result['model']

            X_train_val = np.vstack((X_train, X_validation))
            y_train_val = np.append(y_train, y_validation)
            model.fit(X_train_val, pd.DataFrame(y_train_val))

            test_acc = 0.0
            if min_accuracy > 0.0:
                test_acc = auc_scorer(model, X_test, pd.DataFrame(y_test))
            test_fair = 0.0
            if min_fairness > 0.0:
                test_fair = 1.0 - fair_test(model, X_test,
                                            pd.DataFrame(y_test))
            test_robust = 0.0
            if min_robustness > 0.0:
                test_robust = 1.0 - robust_score_test(
                    eps=0.1,
                    X_test=X_test,
                    y_test=y_test,
                    model=model.named_steps['clf'],
                    feature_selector=model.named_steps['selection'],
                    scorer=auc_scorer)

            my_result['test_fair'] = test_fair
            my_result['test_acc'] = test_acc
            my_result['test_robust'] = test_robust
            my_result['final_time'] = time.time() - start_time
            my_result['Finished'] = True

            success = False
            if test_fair >= min_fairness and test_acc >= min_accuracy and test_robust >= min_robustness:
                success = True

            my_result['success_test'] = success
            pickle.dump(my_result, f_log)
            return result, {'success': success}

        return result, {}

    number_of_evaluations = 0

    current_feature_set = list(range(X_train.shape[1]))
    while len(current_feature_set) >= 1:
        # select best feature
        number_of_evaluations += 1
        feature_combo = copy.deepcopy(current_feature_set)
        my_result, combo_result = execute_feature_combo(
            feature_combo, number_of_evaluations)
        if min_loss > my_result['loss']:
            min_loss = my_result['loss']
            pickle.dump(my_result, f_log)

        combo_loss = my_result['loss']
        print('loss: ' + str(combo_loss))
        if len(combo_result) > 0:
            f_log.close()
            return combo_result

        worst_id = np.argmin(
            _get_feature_importances(my_result['model'].named_steps['clf']))

        current_feature_set.remove(current_feature_set[worst_id])

    my_result = {
        'number_evaluations': number_of_evaluations,
        'success_test': False,
        'time': time.time() - start_time,
        'Finished': True
    }
    pickle.dump(my_result, f_log)
    f_log.close()
    return {'success': False}
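The function above depends on several project-specific helpers (MaskSelection, robust_score_test, true_positive_rate_score), but its core is a plain backward-elimination loop: fit, read importances via _get_feature_importances, drop the weakest feature, repeat. A self-contained sketch of just that loop, with an arbitrary dataset and np.abs(coef_) standing in for the helper:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=12, random_state=0)
selected = list(range(X.shape[1]))       # start with every feature
clf = LogisticRegression(max_iter=1000)

while len(selected) > 1:
    clf.fit(X[:, selected], y)
    importances = np.abs(clf.coef_).ravel()    # stand-in for _get_feature_importances
    del selected[int(np.argmin(importances))]  # drop the least important feature

print("surviving feature:", selected)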