def incremental_search(X, y, k, measure, copula=None):
    """Greedily build nested feature subsets of sizes 1 to k (mRMR-style).

    At each step the feature maximizing (relevance to the labels) minus
    (mean redundancy with the already-selected features) is appended.
        Reference: "Feature Selection Based on Mutual Information: Criteria of Max-Dependency, Max-Relevance, and Min-Redundancy", §2.2, Peng et al.

    :X: dataset features, shape (n_samples, n_features)
    :y: dataset labels, 1-D or 2-D
    :k: maximum size of subsets
    :measure: dependency measure (instance of DependencyMeasure)
    :copula: precomputed copula of np.c_[X, y] (optional)
    :returns: list of subsets of sizes 1 to k

    """
    S = []          # indices selected so far, in selection order
    subsets = []    # snapshot of S after each selection step
    m = X.shape[1]
    # Normalize labels to a column so they can be stacked next to the features.
    Y = y[:, np.newaxis] if y.ndim == 1 else y

    X_ = np.c_[X, Y]

    if measure.measure == 'copula':
        Z = approx_copula(X_) if copula is None else copula
    else:
        Z = X_

    if measure.measure == 'hsic':
        # Precompute the zero-diagonal label kernel once; reused by every score call.
        L = measure.label_kernel(Y, Y)
        np.fill_diagonal(L, 0)
        L_ones = L.dot(np.ones(X.shape[0]))
    else:
        L = None
        L_ones = None

    for i in trange(k, leave=False):
        best_score = -np.inf
        best_feature = -1
        # BUG FIX: the original iterated `set(np.arange(m)) - set(S)` directly;
        # set iteration order is arbitrary, so tie-breaking between equal-scoring
        # features (and hence the returned subsets) was non-deterministic.
        # Sorting makes the scan deterministic and matches the i == 0 behavior,
        # which scanned features in index order via range(m).
        for j in sorted(set(range(m)) - set(S)):
            # Relevance of candidate j to the labels (last column of Z)...
            score = measure.score(Z[:, j], Z[:, -1], L, L_ones)
            if i > 0:
                # ...penalized by its mean redundancy with features already in S.
                redundancy = sum(
                    measure.score(Z[:, j], Z[:, s], L, L_ones) for s in S)
                score -= redundancy / i
            if score > best_score:
                best_score = score
                best_feature = j

        S.append(best_feature)
        # S holds plain ints, so a shallow copy suffices (was copy.deepcopy).
        subsets.append(list(S))

    return subsets
# Example #2
# 0
import pickle

import numpy as np
from sklearn import ensemble, linear_model, svm
from sklearn.datasets import load_breast_cancer
import sklearn.metrics.pairwise as sk

import benchmark_tools as bm
from copula_dependency import approx_copula
from feature_selection_algorithms import heuristic_selection
from measure import DependencyMeasure

# Breast-cancer classification benchmark: load the dataset and precompute
# the copula of the joint (features, labels) matrix once, up front.
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
breast_copula = approx_copula(np.c_[X, y])

# Hyper-parameters for the two downstream classifiers.
gb_params = dict(
    loss='deviance',
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    max_depth=3,
)
svm_params = dict(C=1.0, kernel='rbf', degree=3, gamma='auto')

results_algs = {}
for measure_name in ['hsic', 'copula']:
    gb_clf = ensemble.GradientBoostingClassifier(**gb_params)
    svm_clf = svm.SVC(**svm_params)
    classifiers = [('gb', gb_clf), ('svm', svm_clf)]
    results_algs[measure_name] = bm.compare_algorithms(
def forward_selection(X,
                      y,
                      t,
                      measure,
                      estimators,
                      cv,
                      regression=True,
                      copula=None):
    """Implement the Forward Selection algorithm for feature selection.
        Reference: "Feature Selection via Dependence Maximization", §4.1, Song, Smola, Gretton, Bedo, Borgwardt

    :X: dataset features
    :y: dataset labels
    :t: desired number of features
    :measure: dependency measure (instance of DependencyMeasure)
    :estimators: list of estimators used to select the best subset (list of tuples (est_name, est))
    :cv: number of folds for cross-validation
    :regression: boolean, True if the task is a regression, False if it is a classification
    :copula: copula distribution (optional)
    :returns: dict whose keys are estimators names and values are a tuple (best subset, cv mean, cv std)

    """
    S = set(range(X.shape[1]))  # remaining candidate feature indices
    T = []                      # selected features, in selection order

    # Normalize labels to a 2-D column so kernel/copula code sees a matrix.
    Y = y[:, np.newaxis] if y.ndim == 1 else y

    if measure.measure == 'copula':
        X = approx_copula(X) if copula is None else copula[:, :-1]
        # BUG FIX: was approx_copula(y) on the raw (possibly 1-D) labels.
        # Use the 2-D Y built above, consistent with how incremental_search
        # always feeds 2-D arrays to approx_copula.
        Y = approx_copula(Y)

    if measure.measure == 'hsic':
        # Precompute the zero-diagonal label kernel once for all score calls.
        L = measure.label_kernel(Y, Y)
        np.fill_diagonal(L, 0)
        L_ones = L.dot(np.ones(X.shape[0]))
    else:
        L = None
        L_ones = None

    # Greedily move ~10% of the remaining candidates into T per round, picking
    # the subset whose members maximize the summed dependency score with Y.
    while len(S) > 1:
        subset_size = int(math.ceil(0.1 * len(S)))
        best_score_sum = -np.inf
        best_subset = None
        for subset in tqdm(combinations(S, subset_size),
                           total=int(binom(len(S), subset_size)),
                           leave=False):
            subset = set(subset)
            score_sum = 0.0
            for j in subset:
                feats = np.array(T + [j])
                score_sum += measure.score(X[:, feats], Y, L, L_ones)
            if score_sum > best_score_sum:
                best_score_sum = score_sum
                best_subset = subset
        S = S - best_subset
        T = T + list(best_subset)

    # Keep only the first t features selected.
    T = T[:t]

    cv_scores = {}

    # NOTE(review): when measure is 'copula', X was replaced by its copula
    # transform above, so the CV below also runs on transformed features —
    # confirm this is intended rather than CV on the original X.
    scoring = 'neg_mean_squared_error' if regression else 'accuracy'
    for est_name, est in estimators:
        scores = cross_val_score(est, X[:, T], y, cv=cv, scoring=scoring)
        cv_scores[est_name] = (T, scores.mean(), scores.std())

    return cv_scores
# Example #4
# 0
import pickle

import numpy as np
from sklearn import ensemble, linear_model, svm
from sklearn.datasets import load_boston
import sklearn.metrics.pairwise as sk

import benchmark_tools as bm
from copula_dependency import approx_copula
from feature_selection_algorithms import heuristic_selection
from measure import DependencyMeasure


# Boston housing regression benchmark.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — running this fragment requires an older sklearn (or another dataset).
boston = load_boston()
X = boston.data
y = boston.target
# Precompute the copula of the joint (features, target) matrix once.
boston_copula = approx_copula(np.c_[X, y])

# Hyper-parameters for the two downstream regressors.
gb_params = dict(loss='ls', learning_rate=0.1,
                 n_estimators=100, subsample=1.0, max_depth=3)
svm_params = dict(C=1.0, kernel='rbf', degree=3, gamma='auto')

results_algs = {}
for measure_name in ('hsic', 'copula'):
    # Fresh estimator instances per measure so fitted state never leaks across runs.
    classifiers = [('gb', ensemble.GradientBoostingRegressor(**gb_params)),
                   ('svm', svm.SVR(**svm_params))]
    results_algs[measure_name] = bm.compare_algorithms(
        X, y, estimators=classifiers, measure_name=measure_name,
        copula=boston_copula, regression=True)

results_kerns = {}
for measure_name in ['hsic', 'copula', 'mutual_information']: