def run():
    """Select features with a random forest, retrain on them and report.

    Loads the data set from the hard-coded CSV path with ``BTUEL`` as the
    label, fits a ``RandomForestRegressor`` on all columns, keeps the columns
    chosen by ``SelectFromModel``, refits the forest on a 75/25 split of the
    reduced matrix and prints:

    * the names of the selected features,
    * the first 100 held-out targets,
    * the mean squared error on the held-out split,
    * the selected features sorted by decreasing importance.
    """
    allKeys, X, y = loadData(
        "../../data/household_electricity_usage/recs2009_public.csv",
        label="BTUEL",
        otherRemove=[
            "KWH", "KWHSPH", "KWHCOL", "KWHWTH", "KWHRFG", "KWHOTH",
            "BTUEL", "BTUELSPH", "BTUELCOL", "BTUELWTH", "BTUELRFG",
            "BTUELOTH", "DOLLAREL", "DOLELSPH", "DOLELCOL", "DOLELWTH",
            "DOLELRFG", "DOLELOTH", "TOTALBTUOTH", "TOTALBTUCOL",
            'TOTALBTU', 'TOTALBTUWTH', 'TOTALBTU', 'TOTALBTUSPH',
            'TOTALBTURFG', 'TOTALDOL', 'TOTALDOLSPH', 'TOTALDOLCOL',
            'TOTALDOLWTH', 'TOTALDOLRFG', 'TOTALDOLOTH',
        ])
    # Alternative loader call kept for reference (restricts the columns
    # through ``forceUse`` instead of removing the leaking ones):
    # allKeys, X, y = loadData(
    #     "../../data/household_electricity_usage/recs2009_public.csv",
    #     label = "BTUEL", otherRemove = [], forceUse =
    #     [
    #         'WGTP', 'NP', 'TYPE', 'ACR', 'BDSP', 'BATH', 'FS','MHP', 'RMSP',
    #         'RNTP', 'REFR', 'RNTP', 'RWAT', 'STOV', 'TEN', 'VALP', 'YBL',
    #         'FES', 'FINCP', 'HINCP', 'HHT', 'KIT', 'NOC', 'NPF', 'PLM',
    #         'SRNT', 'SVAL', 'TAXP', 'WIF', 'WORKSTAT',
    #     ])

    clf = RandomForestRegressor(n_estimators=100, n_jobs=7)
    clf.fit(X, y)

    # The selector shares ``clf``; read the mask now, before ``clf`` is refit
    # on the reduced matrix below.
    model = SelectFromModel(clf, prefit=True)
    support = model.get_support()
    X = model.transform(X)
    relevantFeatures = [key for key, keep in zip(allKeys, support) if keep]
    print("Relevant Features", relevantFeatures)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)
    clf.fit(X_train, y_train)
    print(y_test[:100])
    print(metrics.mean_squared_error(y_test, clf.predict(X_test)))

    # ``clf`` now only knows the selected columns, so its importances must be
    # paired with ``relevantFeatures`` rather than with every original key.
    features = sorted(zip(relevantFeatures, clf.feature_importances_),
                      key=lambda pair: pair[1], reverse=True)
    print("Features", features)
def test_sample_weight():
    """Check that ``SelectFromModel.fit`` forwards ``sample_weight``.

    The support mask obtained with class-dependent weights must differ from
    the unweighted one, and must stay the same when every weight is
    multiplied by 3.
    """
    features, labels = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # Samples of class 1 weigh 100 times more than the others.
    weights = np.ones(labels.shape)
    weights[labels == 1] *= 100

    selector = SelectFromModel(
        estimator=LogisticRegression(random_state=0, fit_intercept=False)
    )

    def support_after_fit(sample_weight):
        # Refit the shared selector and report the resulting mask.
        selector.fit(features, labels, sample_weight=sample_weight)
        return selector._get_support_mask()

    unweighted = support_after_fit(None)
    weighted = support_after_fit(weights)
    assert not np.all(weighted == unweighted)

    rescaled = support_after_fit(3 * weights)
    assert np.all(weighted == rescaled)
def _get_support_mask(self):
    """Return the feature mask, computing and caching it on first use.

    The mask comes from the parent implementation, except that it is all
    ``True`` when every ``feature_importances_`` value is NaN. If a
    ``ValueError`` is raised, the mask is taken from a prefit
    ``SelectFromModel`` built on ``estimator.estimator_``. Indices in
    ``self._out_mask`` are then forced to ``False`` and indices in
    ``self._in_mask`` to ``True``.

    Raises ``ValueError`` when the selector is neither prefit nor fitted.
    """
    cached = self._cache_support_mask
    if cached is not None:
        return cached

    if self.prefit:
        fitted = self.estimator
    elif hasattr(self, 'estimator_'):
        fitted = self.estimator_
    else:
        raise ValueError(
            'Either fit the model before transform or set "prefit=True"'
            ' while passing the fitted estimator to the constructor.')

    try:
        with np.errstate(divide='ignore', invalid='ignore'):
            scores = getattr(fitted, "feature_importances_", None)
            all_nan = scores is not None and np.isnan(scores).all()
            if all_nan:
                # No usable importances: keep every feature.
                support = np.ones(scores.shape, bool)
            else:
                support = super(_SelectFromModel, self)._get_support_mask()
    except ValueError:
        # Fall back to the estimator wrapped inside ``fitted``.
        fallback = SelectFromModel(fitted.estimator_, self.threshold, True)
        support = fallback._get_support_mask()

    # Explicit exclusions are applied first, explicit inclusions last.
    for idx in self._out_mask:
        support[idx] = False
    for idx in self._in_mask:
        support[idx] = True

    self._cache_support_mask = support
    return support
class SelectFromLinearSVC(AbstractFeatureSelector):
    """Feature selector backed by a ``LinearSVC`` wrapped in ``SelectFromModel``.

    ``fit`` trains a ``LinearSVC`` with the stored hyper-parameters and wraps
    it in a prefit ``SelectFromModel`` using ``threshold``; the support mask
    is delegated to that wrapper.
    """

    # Search space for the hyper-parameters ``threshold`` and ``C``.
    param_distributions = {
        'threshold': (1e-5,),
        'C': [float(x) for x in np.logspace(-2, 5, 100)]
    }

    def __init__(self, threshold=None, penalty='l1', loss='squared_hinge',
                 dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                 random_state=None, max_iter=1000):
        """Store the ``SelectFromModel`` threshold and ``LinearSVC`` settings."""
        self.threshold = threshold
        self.penalty = penalty
        self.loss = loss
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.random_state = random_state
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the ``LinearSVC`` on ``(X, y)`` and build the selector.

        Returns ``self``.
        """
        # ``C`` must be forwarded; otherwise the value chosen from
        # ``param_distributions`` is silently ignored.
        self.linear_svc = LinearSVC(penalty=self.penalty, loss=self.loss,
                                    dual=self.dual, tol=self.tol,
                                    C=self.C,
                                    fit_intercept=self.fit_intercept,
                                    random_state=self.random_state,
                                    max_iter=self.max_iter)
        self.linear_svc.fit(X, y)
        self.select_from_model = SelectFromModel(self.linear_svc,
                                                 threshold=self.threshold,
                                                 prefit=True)
        return self

    def _get_support_mask(self):
        """Return the mask computed by the wrapped ``SelectFromModel``."""
        return self.select_from_model._get_support_mask()
class SelectFromLasso(AbstractFeatureSelector):
    """Feature selector backed by a ``Lasso`` wrapped in ``SelectFromModel``.

    ``fit`` trains a ``Lasso`` with the stored hyper-parameters and wraps it
    in a prefit ``SelectFromModel`` using ``threshold``.
    """

    # Search space for the hyper-parameters ``threshold`` and ``alpha``.
    param_distributions = {
        'threshold': (1e-5,),
        'alpha': [float(x) for x in np.logspace(-5, 2, 100)]
    }

    def __init__(self, threshold=None, alpha=1.0, fit_intercept=True,
                 normalize=False, max_iter=1000, tol=0.0001, positive=False,
                 selection='cyclic', random_state=None):
        """Store the ``SelectFromModel`` threshold and the ``Lasso`` settings."""
        self.threshold = threshold
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.max_iter = max_iter
        self.tol = tol
        self.positive = positive
        self.selection = selection
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the ``Lasso`` on ``(X, y)``, build the selector, return ``self``."""
        # NOTE: y is an ndarray of strings
        lasso_kwargs = dict(
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            max_iter=self.max_iter,
            tol=self.tol,
            positive=self.positive,
            selection=self.selection,
            random_state=self.random_state,
        )
        self.lasso = Lasso(**lasso_kwargs)
        self.lasso.fit(X, y)

        selector = SelectFromModel(self.lasso, threshold=self.threshold,
                                   prefit=True)
        self.select_from_model = selector
        return self

    def _get_support_mask(self):
        """Return the mask computed by the wrapped ``SelectFromModel``."""
        selector = self.select_from_model
        return selector._get_support_mask()
def test_sample_weight():
    # Sample weights given to SelectFromModel.fit must reach the wrapped
    # estimator: class-dependent weights change the selected features, while
    # multiplying all weights by a constant does not.
    dataset_options = dict(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    data, target = datasets.make_classification(**dataset_options)

    # Give every class-1 sample a weight of 100, all others a weight of 1.
    per_sample = np.ones(target.shape)
    per_sample[target == 1] *= 100

    logreg = LogisticRegression(random_state=0, fit_intercept=False)
    sfm = SelectFromModel(estimator=logreg)

    sfm.fit(data, target, sample_weight=None)
    plain_support = sfm._get_support_mask()

    sfm.fit(data, target, sample_weight=per_sample)
    weighted_support = sfm._get_support_mask()
    masks_identical = np.all(weighted_support == plain_support)
    assert not masks_identical

    sfm.fit(data, target, sample_weight=3 * per_sample)
    scaled_support = sfm._get_support_mask()
    assert np.all(weighted_support == scaled_support)
def test_filter_prefit(self):
    """A prefit ``SelectFromModel`` is proxied by ``_filter_steps``.

    The returned step must be a ``SelectorProxy`` around an
    ``EstimatorProxy``, must still lack ``estimator_``, and must report the
    support mask ``[0, 1]``.
    """
    regressor = DecisionTreeRegressor()
    train_X = numpy.array([[0, 1], [0, 2], [0, 3]])
    train_y = numpy.array([0.5, 1.0, 1.5])
    regressor.fit(train_X, train_y)

    original = SelectFromModel(regressor, prefit = True)
    self.assertTrue(hasattr(original, "estimator"))
    self.assertFalse(hasattr(original, "estimator_"))

    # Single-step pipeline; take the transformed object out of the first step.
    filtered_steps = _filter_steps([("selector", original, {})])
    proxied = filtered_steps[0][1]
    self.assertIsInstance(proxied, SelectorProxy)
    self.assertIsInstance(proxied.estimator, EstimatorProxy)
    self.assertFalse(hasattr(proxied, "estimator_"))

    support = proxied._get_support_mask()
    self.assertEqual([0, 1], support.tolist())
def test_max_features_tiebreak():
    """``max_features`` must break ties between equal importances.

    With fixed, non-increasing importances and ``threshold=-np.inf``, asking
    for ``k`` features must select exactly the first ``k`` columns.
    """
    data, target = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    n_columns = data.shape[1]
    importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])

    for k in range(1, n_columns + 1):
        selector = SelectFromModel(
            FixedImportanceEstimator(importances),
            max_features=k,
            threshold=-np.inf,
        )
        reduced = selector.fit_transform(data, target)

        support = selector._get_support_mask()
        chosen = np.where(support)[0]
        assert_array_equal(chosen, np.arange(k))
        assert reduced.shape[1] == k
def _get_support_mask(self):
    """Return the feature mask with forced exclusions and inclusions applied.

    The parent implementation is tried first. On ``ValueError`` the mask is
    taken from a prefit ``SelectFromModel`` built on ``estimator.estimator_``.
    Indices in ``self._out_mask`` are then set to ``False`` and indices in
    ``self._in_mask`` to ``True``.

    Raises ``ValueError`` when the fallback is needed but the selector is
    neither prefit nor fitted.
    """
    try:
        support = super(_SelectFromModel, self)._get_support_mask()
    except ValueError:
        # SelectFromModel can directly call on transform.
        if not self.prefit and not hasattr(self, 'estimator_'):
            raise ValueError(
                'Either fit the model before transform or set "prefit=True"'
                ' while passing the fitted estimator to the constructor.')
        wrapper = self.estimator if self.prefit else self.estimator_
        fallback = SelectFromModel(wrapper.estimator_, self.threshold, True)
        support = fallback._get_support_mask()

    # Exclusions first, inclusions last.
    for idx in self._out_mask:
        support[idx] = False
    for idx in self._in_mask:
        support[idx] = True
    return support
def check_max_features(est, X, y):
    """Run the ``max_features`` checks of ``SelectFromModel`` on ``est``.

    Covers, in order: the valid/invalid value helpers, equivalence of
    ``max_features='all'`` and ``max_features=n_columns``, truncation against
    a ``Lasso(alpha=0.025)`` selection, and tie-breaking with fixed
    importances.
    """
    data = X.copy()
    n_columns = data.shape[1]

    check_valid_max_features(est, data, y)
    check_invalid_max_features(est, data, y)

    # 'all' and the explicit column count must select the same columns.
    select_all = SelectFromModel(estimator=est, max_features='all')
    select_count = SelectFromModel(estimator=est, max_features=n_columns)
    out_all = select_all.fit_transform(data, y)
    out_count = select_count.fit_transform(data, y)
    assert_array_equal(out_all, out_count)

    # Test max_features against actual model.
    unrestricted = SelectFromModel(estimator=Lasso(alpha=0.025))
    out_unrestricted = unrestricted.fit_transform(data, y)
    for k in range(1, out_unrestricted.shape[1] + 1):
        restricted = SelectFromModel(estimator=Lasso(alpha=0.025),
                                     max_features=k)
        out_restricted = restricted.fit_transform(data, y)
        assert_array_equal(out_unrestricted[:, :k], out_restricted)
        assert_array_equal(unrestricted.estimator_.coef_,
                           restricted.estimator_.coef_)

    # Test if max_features can break tie among feature importance
    importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for k in range(1, n_columns + 1):
        selector = SelectFromModel(
            FixedImportanceEstimator(importances),
            max_features=k)
        reduced = selector.fit_transform(data, y)
        chosen = np.where(selector._get_support_mask())[0]
        assert_array_equal(chosen, np.arange(k))
        assert_equal(reduced.shape[1], k)
n_redundant=0, n_repeated=0, n_classes=2, random_state=0, shuffle=False) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) # Build a forest using all features and compute the feature importances forest = ExtraTreesClassifier(n_estimators=250, random_state=0) forest.fit(X_train, y_train) # Calculate model performance based on all features y_pred, y_prob = forest.predict(X_test), forest.predict_proba(X_test) ScoreAll = roc_auc_score(y_test, y_prob[:, 1]) # Select certain features based on ExtraTrees importances importances = forest.feature_importances_ indices = np.argsort(importances)[::-1] model = SelectFromModel(forest, prefit=True) X_test_single = model.transform(X) # Calculate model performance based on subset of features from single run y_pred, y_prob = forest.predict(X_test), forest.predict_proba(X_test) roc_auc_score(y_test, y_prob[:, 1]) # Set the initial global ratings global_ratings = 1000 * np.ones(X.shape[1]) X_mask = model._get_support_mask() global_ratings[X_mask] += 1 global_ratings[~X_mask] -= 1
def feature_selection(tfidf: np.array,
                      titles: np.array,
                      inv_reg: float = 3,
                      random_state: int = 1,
                      name='nyc',
                      penalty: str = 'l1',
                      bootstrap: int = 0,
                      proportion=0.8,
                      refresh=False) -> tuple:
    """Select features with a penalised logistic regression and rescale them.

    :param tfidf: A matrix containing 'TFIDF' info. NOTE(review): despite the
        annotation, ``.toarray()`` is called on a column slice of it, so a
        SciPy sparse matrix appears to be expected -- confirm with callers.
    :param titles: A list of titles (the classification targets)
    :param inv_reg: the inverse regularization coefficient
    :param random_state: A random seed that starts for the logistic regression
    :param name: the name of the dataset (used to name the pickle file)
    :param penalty: The type of penalty to do features selection
        eg: 'l1' or 'elasticnet'
    :param bootstrap: The number of times to bootstrap, or 0 to not do
        bootstrapping
    :param proportion: Proportion of models the feature must appear in before
        putting it into our model (passed to ``combine_masks``)
    :param refresh: choose whether to refresh the cache.
    :return: ``(filtered_tfidf, mask)``: the selected columns as a
        ``sparse.csr_matrix`` (each column multiplied by the norm of its
        coefficients in a refitted model, then each row divided by its L2
        norm), and the feature mask used for the selection.
    """
    l1_ratio = L1RATIO if penalty == 'elasticnet' else None

    if not bootstrap:
        logreg = LogisticRegression(random_state=random_state,
                                    penalty=penalty,
                                    C=inv_reg,
                                    solver='saga',
                                    multi_class="multinomial",
                                    n_jobs=4,
                                    max_iter=3000,
                                    l1_ratio=l1_ratio)
        logreg.fit(tfidf, titles)
        print(f'The accuracy of the model was {logreg.score(tfidf, titles)}')
        model = SelectFromModel(logreg, prefit=True)
        mask = model.get_support()
    else:
        dump_path = f"temp/cache/bootstrap_{name}_{penalty}_{inv_reg if isinstance(inv_reg, int) else 'r' + str(int(100 / inv_reg))}_{bootstrap}"
        if not refresh and os.path.exists(dump_path):
            masks = load(dump_path)
        else:
            masks = []
            # One estimator is reused; only its seed changes per resample.
            temp_logreg = LogisticRegression(penalty=penalty,
                                             C=inv_reg,
                                             solver='saga',
                                             multi_class="multinomial",
                                             n_jobs=4,
                                             max_iter=700,
                                             l1_ratio=l1_ratio)
            for i in range(bootstrap):
                temp_tfidf, temp_titles = resample(tfidf, titles,
                                                   random_state=i)
                temp_logreg.random_state = random_state + i
                temp_logreg.fit(temp_tfidf, temp_titles)
                print(
                    f'The accuracy of model {i + 1} was {temp_logreg.score(tfidf, titles)}'
                )
                model = SelectFromModel(temp_logreg, prefit=True)
                masks.append(model.get_support())
            # Create the cache directory on first use so the dump cannot
            # fail after all the models have been trained.
            os.makedirs(os.path.dirname(dump_path), exist_ok=True)
            with open(dump_path, 'wb') as f:
                dill.dump(masks, f)
        mask = combine_masks(masks, proportion)

    # Slice once and reuse the result for both the report and the refit.
    filtered_tfidf = tfidf[:, mask]
    print(
        f'Number of features has been reduced from {tfidf.shape[1]} to {filtered_tfidf.shape[1]}'
    )

    # Refit on the selected columns only; these coefficients weight the
    # columns below.
    debiased_log = LogisticRegression(random_state=random_state,
                                      penalty=penalty,
                                      C=inv_reg,
                                      solver='saga',
                                      multi_class="multinomial",
                                      n_jobs=4,
                                      max_iter=700,
                                      l1_ratio=l1_ratio)
    debiased_log.fit(filtered_tfidf, titles)
    for j in range(filtered_tfidf.shape[1]):
        filtered_tfidf[:, j] *= np.linalg.norm(debiased_log.coef_[:, j])

    # Per-row L2 norms, shaped as a column vector for the division.
    dense_tfidf = filtered_tfidf.toarray()
    scale = np.linalg.norm(dense_tfidf.T, axis=0).reshape([-1, 1])
    filtered_tfidf /= scale
    filtered_tfidf = sparse.csr_matrix(filtered_tfidf)
    return filtered_tfidf, mask
y = fm.defineClass(y) #控制选择的特征的数量的参数 cc = [0.1] score = [] print "======原始特征=========" print X.shape print "*" * 80 for c in cc: clf_l1_LR = LogisticRegression(C=c, penalty='l1', tol=0.001) clf_l1_LR.fit(X, y) coef = clf_l1_LR.coef_ print "=======LR model=========" print clf_l1_LR model = SelectFromModel(clf_l1_LR, prefit=True) feature_mask = model._get_support_mask() #获得特征选择的下标 new_mask = feature_mask.astype('float64') coef_img = nifti_masker.inverse_transform(new_mask) coef_img.to_filename('D:\sub001_L1.img') XX = model.transform(X) yy = y print "====新的特征=======" print XX.shape cv = StratifiedKFold(yy, n_folds) cv_scores = [] for train, test in cv: svc = SVC(kernel='linear') svc.fit(XX[train], yy[train]) prediction = svc.predict(XX[test])
y = fm.defineClass(y) #控制选择的特征的数量的参数 cc = [0.1] score = [] print "======原始特征=========" print X.shape print "*"*80 for c in cc: clf_l1_LR = LogisticRegression(C=c, penalty='l1', tol=0.001) clf_l1_LR.fit(X, y) coef = clf_l1_LR.coef_ print "=======LR model=========" print clf_l1_LR model = SelectFromModel(clf_l1_LR, prefit=True) feature_mask = model._get_support_mask() #获得特征选择的下标 new_mask = feature_mask.astype('float64') coef_img = nifti_masker.inverse_transform(new_mask) coef_img.to_filename('D:\sub001_L1.img') XX = model.transform(X) yy = y print "====新的特征=======" print XX.shape cv = StratifiedKFold(yy,n_folds) cv_scores = [] for train, test in cv: svc = SVC(kernel='linear') svc.fit(XX[train], yy[train]) prediction = svc.predict(XX[test])
from sklearn.linear_model import LassoCV # Load the boston dataset. boston = load_boston() X, y = boston['data'], boston['target'] # We use the base estimator LassoCV since the L1 norm promotes sparsity of features. clf = LassoCV() clf.fit(X,y) # Set a minimum threshold of 0.25 sfm = SelectFromModel(clf, threshold='mean',prefit=True) print X.shape #sfm = sfm.fit(X, y) print "============LassoCV================" print "选择的特征" print sfm._get_support_mask(); n_features = sfm.transform(X).shape[1] print n_features # We use LinearSVC from sklearn.svm import LinearSVC #C 越小,选择的特征越少 lsvc = LinearSVC(C=0.001, penalty="l1", dual=False) y = y.astype(np.int64) #转换成整数,因为是分类器,不是回归 lsvc.fit(X,y) model = SelectFromModel(lsvc, prefit=True) print "============线性SVM===============================" print "选择的特征" print model._get_support_mask(); n_features = model.transform(X).shape[1] print n_features