def incremental_search(X, y, k, measure, copula=None):
    """Greedily build nested feature subsets of sizes 1..k (incremental search).

    Reference: "Feature Selection Based on Mutual Information: Criteria of
    Max-Dependency, Max-Relevance, and Min-Redundancy", §2.2, Peng et al.

    :X: dataset features
    :y: dataset labels
    :k: maximum size of subsets
    :measure: dependency measure (instance of DependencyMeasure)
    :copula: copula distribution (optional)
    :returns: list of subsets of sizes 1 to k
    """
    n_features = X.shape[1]
    labels = y[:, np.newaxis] if y.ndim == 1 else y
    joint = np.c_[X, labels]

    # Score either in copula space or on the raw (feature, label) matrix.
    if measure.measure == 'copula':
        Z = approx_copula(joint) if copula is None else copula
    else:
        Z = joint

    # HSIC reuses one zero-diagonal label kernel across every score call.
    if measure.measure == 'hsic':
        L = measure.label_kernel(labels, labels)
        np.fill_diagonal(L, 0)
        L_ones = L.dot(np.ones(X.shape[0]))
    else:
        L, L_ones = None, None

    selected = []
    subsets = []
    for step in trange(k, leave=False):
        best_score, best_feature = -np.inf, -1
        if step == 0:
            # First pick: pure relevance to the label column (last of Z).
            candidates = range(n_features)
        else:
            candidates = set(np.arange(n_features)) - set(selected)
        for j in candidates:
            relevance = measure.score(Z[:, j], Z[:, -1], L, L_ones)
            if step == 0:
                total = relevance
            else:
                # Relevance minus mean redundancy w.r.t. already-chosen features.
                redundancy = 0.0
                for s in selected:
                    redundancy += measure.score(Z[:, j], Z[:, s], L, L_ones)
                total = -redundancy / step + relevance
            if total > best_score:
                best_score, best_feature = total, j
        selected.append(best_feature)
        subsets.append(copy.deepcopy(selected))
    return subsets
import pickle import numpy as np from sklearn import ensemble, linear_model, svm from sklearn.datasets import load_breast_cancer import sklearn.metrics.pairwise as sk import benchmark_tools as bm from copula_dependency import approx_copula from feature_selection_algorithms import heuristic_selection from measure import DependencyMeasure breast_cancer = load_breast_cancer() X, y = breast_cancer.data, breast_cancer.target breast_copula = approx_copula(np.c_[X, y]) gb_params = { 'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 1.0, 'max_depth': 3 } svm_params = {'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto'} results_algs = {} for measure_name in ['hsic', 'copula']: gb_clf = ensemble.GradientBoostingClassifier(**gb_params) svm_clf = svm.SVC(**svm_params) classifiers = [('gb', gb_clf), ('svm', svm_clf)] results_algs[measure_name] = bm.compare_algorithms(
def forward_selection(X, y, t, measure, estimators, cv, regression=True, copula=None):
    """Implement Forward Elimination algorithm for Feature Selection.

    Reference: "Feature Selection via Dependence Maximization", §4.1,
    Le Sing, Smola, Gretton, Bedo, Borgwardt

    :X: dataset features
    :y: dataset labels
    :t: desired number of features
    :measure: dependency measure (instance of DependencyMeasure)
    :estimators: list of estimators used to select the best subset
        (list of tuples (est_name, est))
    :cv: number of folds for cross-validation
    :regression: boolean, True if the task is a regression, False if it is
        a classification
    :copula: copula distribution (optional)
    :returns: dict whose keys are estimators names and values are a tuple
        (best subset, cv mean, cv std)
    """
    remaining = set(range(X.shape[1]))
    chosen = []
    Y = y[:, np.newaxis] if y.ndim == 1 else y
    if measure.measure == 'copula':
        # NOTE(review): Y is recomputed as the marginal copula of y rather
        # than taken from the joint `copula` argument — confirm intended.
        X = approx_copula(X) if copula is None else copula[:, :-1]
        Y = approx_copula(y)
    # HSIC precomputes one zero-diagonal label kernel for every score call.
    if measure.measure == 'hsic':
        L = measure.label_kernel(Y, Y)
        np.fill_diagonal(L, 0)
        L_ones = L.dot(np.ones(X.shape[0]))
    else:
        L, L_ones = None, None
    # Greedily absorb ~10% of the remaining features per round until only
    # one candidate is left; then truncate to the requested size t.
    while len(remaining) > 1:
        batch = int(math.ceil(0.1 * len(remaining)))
        best_sum, winner = -np.inf, None
        n_combos = int(binom(len(remaining), batch))
        for combo in tqdm(combinations(remaining, batch),
                          total=n_combos, leave=False):
            combo = set(combo)
            total = 0.0
            for j in combo:
                feats = np.array(chosen + [j])
                total += measure.score(X[:, feats], Y, L, L_ones)
            if total > best_sum:
                best_sum, winner = total, combo
        remaining = remaining - winner
        chosen = chosen + list(winner)
    chosen = chosen[:t]
    # Evaluate the final subset with each estimator via cross-validation.
    metric = 'neg_mean_squared_error' if regression else 'accuracy'
    cv_scores = {}
    for est_name, est in estimators:
        folds = cross_val_score(est, X[:, chosen], y, cv=cv, scoring=metric)
        cv_scores[est_name] = (chosen, folds.mean(), folds.std())
    return cv_scores
import pickle import numpy as np from sklearn import ensemble, linear_model, svm from sklearn.datasets import load_boston import sklearn.metrics.pairwise as sk import benchmark_tools as bm from copula_dependency import approx_copula from feature_selection_algorithms import heuristic_selection from measure import DependencyMeasure boston = load_boston() X, y = boston.data, boston.target boston_copula = approx_copula(np.c_[X, y]) gb_params = {'loss': 'ls', 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 1.0, 'max_depth': 3} svm_params = {'C': 1.0, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto'} results_algs = {} for measure_name in ['hsic', 'copula']: gb_clf = ensemble.GradientBoostingRegressor(**gb_params) svm_clf = svm.SVR(**svm_params) classifiers = [('gb', gb_clf), ('svm', svm_clf)] results_algs[measure_name] = bm.compare_algorithms(X, y, estimators=classifiers, measure_name=measure_name, copula=boston_copula, regression=True) results_kerns = {} for measure_name in ['hsic', 'copula', 'mutual_information']: