# Imports assumed by this snippet; loadData is a project-local helper.
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn import metrics


def run():
    allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove =
        ["KWH", "KWHSPH", "KWHCOL", "KWHWTH", "KWHRFG", "KWHOTH", "BTUEL", "BTUELSPH", "BTUELCOL", "BTUELWTH", "BTUELRFG", "BTUELOTH",
        "DOLLAREL", "DOLELSPH", "DOLELCOL", "DOLELWTH", "DOLELRFG", "DOLELOTH", "TOTALBTUOTH", "TOTALBTUCOL", "TOTALBTU", "TOTALBTUWTH",
         "TOTALBTUSPH", "TOTALBTURFG", "TOTALDOL", "TOTALDOLSPH", "TOTALDOLCOL", "TOTALDOLWTH", "TOTALDOLRFG", "TOTALDOLOTH"])
    #allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove = [], forceUse =
    #                         [
    #                           'WGTP', 'NP', 'TYPE', 'ACR', 'BDSP', 'BATH', 'FS','MHP', 'RMSP', 'RNTP', 'REFR', 'RNTP', 'RWAT', 'STOV', 'TEN', 'VALP', 'YBL', 'FES', 'FINCP', 'HINCP', 'HHT', 'KIT', 'NOC', 'NPF', 'PLM', 'SRNT', 'SVAL', 'TAXP', 'WIF', 'WORKSTAT',
    #                         ])

    clf = RandomForestRegressor(n_estimators = 100, n_jobs = 7)
    clf.fit(X, y)

    model = SelectFromModel(clf, prefit = True)
    X = model.transform(X)

    mask = model._get_support_mask()
    relevantFeatures = [allKeys[i] for i in range(len(mask)) if mask[i]]
    print("Relevant Features", relevantFeatures)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

    clf.fit(X_train, y_train)
    print(y_test[:100])
    print(metrics.mean_squared_error(clf.predict(X_test), y_test))
    # clf was refit on the reduced matrix, so importances align with relevantFeatures
    features = sorted(zip(relevantFeatures, clf.feature_importances_), key = lambda x : x[1], reverse = True)
    print("Features", features)
Example #2
import numpy as np
from sklearn import datasets
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression


def test_sample_weight():
    # Ensure sample weights are passed through to the underlying estimator
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = LogisticRegression(random_state=0, fit_intercept=False)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=None)
    mask = transformer._get_support_mask()
    transformer.fit(X, y, sample_weight=sample_weight)
    weighted_mask = transformer._get_support_mask()
    assert not np.all(weighted_mask == mask)
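    # Rescaling every weight by the same constant should leave the selection unchanged.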
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    reweighted_mask = transformer._get_support_mask()
    assert np.all(weighted_mask == reweighted_mask)
Example #3
    def _get_support_mask(self):
        if self._cache_support_mask is not None:
            return self._cache_support_mask
        if self.prefit:
            estimator = self.estimator
        elif hasattr(self, 'estimator_'):
            estimator = self.estimator_
        else:
            raise ValueError(
                'Either fit the model before transform or set "prefit=True"'
                ' while passing the fitted estimator to the constructor.')
        try:
            with np.errstate(divide='ignore', invalid='ignore'):
                importances = getattr(estimator, "feature_importances_", None)
            if importances is not None and np.isnan(importances).all():
                mask = np.ones(importances.shape, bool)
            else:
                mask = super(_SelectFromModel, self)._get_support_mask()
        except ValueError:
            sfm = SelectFromModel(estimator.estimator_, self.threshold, True)
            mask = sfm._get_support_mask()

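        # Apply forced exclusions first, then forced inclusions, on top of the
        # threshold-based mask.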
        for i in self._out_mask:
            mask[i] = False

        for i in self._in_mask:
            mask[i] = True
        self._cache_support_mask = mask
        return mask
Example #4
class SelectFromLinearSVC(AbstractFeatureSelector):

    param_distributions = {
        'threshold': (1e-5,),
        'C': [float(x) for x in np.logspace(-2, 5, 100)]
    }

    def __init__(self, threshold=None, penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, fit_intercept=True, random_state=None, max_iter=1000):
        self.threshold = threshold
        self.penalty = penalty
        self.loss = loss
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.random_state = random_state
        self.max_iter = max_iter

    def fit(self, X, y):
        self.linear_svc = LinearSVC(penalty=self.penalty, loss=self.loss, dual=self.dual, tol=self.tol,
                                    C=self.C, fit_intercept=self.fit_intercept, random_state=self.random_state,
                                    max_iter=self.max_iter)
        self.linear_svc.fit(X, y)
        self.select_from_model = SelectFromModel(self.linear_svc, threshold=self.threshold, prefit=True)
        return self

    def _get_support_mask(self):
        return self.select_from_model._get_support_mask()
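
AbstractFeatureSelector is project-local; underneath, this class wraps scikit-learn's standard L1-based selection pattern, sketched here directly with the library classes (the data and C value are illustrative):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)

# With an L1 penalty, many coefficients are driven to exactly zero, so
# SelectFromModel keeps only the features with non-zero weights.
lsvc = LinearSVC(C=0.1, penalty='l1', dual=False, max_iter=5000).fit(X, y)
selector = SelectFromModel(lsvc, prefit=True)
print(selector.transform(X).shape)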
Example #5
class SelectFromLasso(AbstractFeatureSelector):

    param_distributions = {
        'threshold': (1e-5,),
        'alpha': [float(x) for x in np.logspace(-5, 2, 100)]
    }

    def __init__(self, threshold=None, alpha=1.0, fit_intercept=True, normalize=False, max_iter=1000, tol=0.0001, positive=False, selection='cyclic', random_state=None):
        self.threshold = threshold
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.max_iter = max_iter
        self.tol = tol
        self.positive = positive
        self.selection = selection
        self.random_state = random_state

    def fit(self, X, y):
        # NOTE: y is an ndarray of strings
        self.lasso = Lasso(alpha=self.alpha, fit_intercept=self.fit_intercept, normalize=self.normalize,
                           max_iter=self.max_iter, tol=self.tol, positive=self.positive, selection=self.selection,
                           random_state=self.random_state)
        self.lasso.fit(X, y)
        self.select_from_model = SelectFromModel(self.lasso, threshold=self.threshold, prefit=True)
        return self

    def _get_support_mask(self):
        return self.select_from_model._get_support_mask()
Example #6
	def test_filter_prefit(self):
		regressor = DecisionTreeRegressor()
		regressor.fit(numpy.array([[0, 1], [0, 2], [0, 3]]), numpy.array([0.5, 1.0, 1.5]))
		selector = SelectFromModel(regressor, prefit = True)
		self.assertTrue(hasattr(selector, "estimator"))
		self.assertFalse(hasattr(selector, "estimator_"))
		selector = _filter_steps([("selector", selector, {})])[0][1]
		self.assertIsInstance(selector, SelectorProxy)
		self.assertIsInstance(selector.estimator, EstimatorProxy)
		self.assertFalse(hasattr(selector, "estimator_"))
		self.assertEqual([0, 1], selector._get_support_mask().tolist())
Example #7
def test_max_features_tiebreak():
    # Test that max_features can break ties among feature importances
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
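
Setting threshold=-np.inf disables the importance threshold entirely, so max_features alone decides how many top-ranked features are kept. A minimal sketch with a real estimator (the test above uses a fixed-importance stub):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# Keep exactly the three highest-importance features.
top3 = SelectFromModel(ExtraTreesClassifier(n_estimators=50, random_state=0),
                       max_features=3, threshold=-np.inf)
X_top3 = top3.fit_transform(X, y)
assert X_top3.shape[1] == 3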
Example #8
    def _get_support_mask(self):
        try:
            mask = super(_SelectFromModel, self)._get_support_mask()
        except ValueError:
            # SelectFromModel supports calling transform() directly on a prefit estimator.
            if self.prefit:
                estimator = self.estimator
            elif hasattr(self, 'estimator_'):
                estimator = self.estimator_
            else:
                raise ValueError(
                    'Either fit the model before transform or set "prefit=True"'
                    ' while passing the fitted estimator to the constructor.')
            sfm = SelectFromModel(estimator.estimator_, self.threshold, True)
            mask = sfm._get_support_mask()

        for i in self._out_mask:
            mask[i] = False

        for i in self._in_mask:
            mask[i] = True

        return mask
Example #9
def check_max_features(est, X, y):
    X = X.copy()
    max_features = X.shape[1]

    check_valid_max_features(est, X, y)
    check_invalid_max_features(est, X, y)

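    # NOTE: max_features='all' appears to target a scikit-learn development
    # branch; released versions accept only None, an int, or a callable here.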
    transformer1 = SelectFromModel(estimator=est, max_features='all')
    transformer2 = SelectFromModel(estimator=est,
                                   max_features=max_features)
    X_new1 = transformer1.fit_transform(X, y)
    X_new2 = transformer2.fit_transform(X, y)
    assert_array_equal(X_new1, X_new2)

    # Test max_features against actual model.

    transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025))
    X_new1 = transformer1.fit_transform(X, y)
    for n_features in range(1, X_new1.shape[1] + 1):
        transformer2 = SelectFromModel(estimator=Lasso(alpha=0.025),
                                       max_features=n_features)
        X_new2 = transformer2.fit_transform(X, y)
        assert_array_equal(X_new1[:, :n_features], X_new2)
        assert_array_equal(transformer1.estimator_.coef_,
                           transformer2.estimator_.coef_)

    # Test that max_features can break ties among feature importances

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert_equal(X_new.shape[1], n_features)
Example #10
File: FS.py  Project: GitGudK/FS2

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# The opening lines of this snippet are truncated in the original; the
# make_classification sample and feature counts below are assumed.
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Build a forest using all features and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X_train, y_train)
# Calculate model performance based on all features
y_pred, y_prob = forest.predict(X_test), forest.predict_proba(X_test)
ScoreAll = roc_auc_score(y_test, y_prob[:, 1])

# Select certain features based on ExtraTrees importances
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
model = SelectFromModel(forest, prefit=True)
X_train_single = model.transform(X_train)
X_test_single = model.transform(X_test)

# Calculate model performance based on the subset of features from a single run
# (the forest must be refit on the reduced matrix before it can score it)
forest_single = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest_single.fit(X_train_single, y_train)
y_prob_single = forest_single.predict_proba(X_test_single)
ScoreSingle = roc_auc_score(y_test, y_prob_single[:, 1])

# Set the initial global ratings
global_ratings = 1000 * np.ones(X.shape[1])

X_mask = model._get_support_mask()
global_ratings[X_mask] += 1
global_ratings[~X_mask] -= 1
Example #11
def feature_selection(tfidf: np.array,
                      titles: np.array,
                      inv_reg: float = 3,
                      random_state: int = 1,
                      name='nyc',
                      penalty: str = 'l1',
                      bootstrap: int = 0,
                      proportion=0.8,
                      refresh=False) -> tuple:
    """

    :param tfidf: A numpy array containing 'TFIDF' info
    :param titles: A list of titles
    :param inv_reg: the inverse regularization coefficient
    :param random_state: A random seed that starts for the logistic regression
    :param name: the name of the dataset (used to name the pickle file)
    :param penalty: The type of penalty to do features selection eg: 'l1' or 'elasticnet'
    :param bootstrap: The number of times to bootstrap, or 0 to not do bootstrapping
    :param refresh: choose whether to refresh the cache.
    :param proportion: Proportion of models the feature must appear in before putting it into our model
    :return: A list of words to ignore
    """
    l1_ratio = L1RATIO if penalty == 'elasticnet' else None
    if not bootstrap:
        logreg = LogisticRegression(random_state=random_state,
                                    penalty=penalty,
                                    C=inv_reg,
                                    solver='saga',
                                    multi_class="multinomial",
                                    n_jobs=4,
                                    max_iter=3000,
                                    l1_ratio=l1_ratio)
        logreg.fit(tfidf, titles)
        print(f'The accuracy of the model was {logreg.score(tfidf, titles)}')
        model = SelectFromModel(logreg, prefit=True)
        mask = model._get_support_mask()
    else:
        dump_path = f"temp/cache/bootstrap_{name}_{penalty}_{inv_reg if isinstance(inv_reg, int) else 'r' + str(int(100 / inv_reg))}_{bootstrap}"
        if not refresh and os.path.exists(dump_path):
            masks = load(dump_path)
        else:
            masks = []
            temp_logreg = LogisticRegression(penalty=penalty,
                                             C=inv_reg,
                                             solver='saga',
                                             multi_class="multinomial",
                                             n_jobs=4,
                                             max_iter=700,
                                             l1_ratio=l1_ratio)
            for i in range(bootstrap):
                temp_tfidf, temp_titles = resample(tfidf,
                                                   titles,
                                                   random_state=i)
                temp_logreg.random_state = random_state + i
                temp_logreg.fit(temp_tfidf, temp_titles)
                print(
                    f'The accuracy of model {i + 1} was {temp_logreg.score(tfidf, titles)}'
                )
                model = SelectFromModel(temp_logreg, prefit=True)
                masks.append(model._get_support_mask())
            with open(dump_path, 'wb') as f:
                dill.dump(masks, f)
        mask = combine_masks(masks, proportion)
    print(
        f'Number of features has been reduced from {tfidf.shape[1]} to {tfidf[:, mask].shape[1]}'
    )
    filtered_tfidf = tfidf[:, mask]
    debiased_log = LogisticRegression(random_state=random_state,
                                      penalty=penalty,
                                      C=inv_reg,
                                      solver='saga',
                                      multi_class="multinomial",
                                      n_jobs=4,
                                      max_iter=700,
                                      l1_ratio=l1_ratio)
    debiased_log.fit(filtered_tfidf, titles)
    for j in range(filtered_tfidf.shape[1]):
        filtered_tfidf[:, j] *= np.linalg.norm(debiased_log.coef_[:, j])
    dense_tfidf = filtered_tfidf.toarray()
    scale = np.linalg.norm(dense_tfidf.T, axis=0).reshape([-1, 1])
    filtered_tfidf /= scale
    filtered_tfidf = sparse.csr_matrix(filtered_tfidf)
    return filtered_tfidf, mask
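
combine_masks is project-local and not shown above; a minimal sketch of plausible semantics, keeping a feature only if it is selected in at least "proportion" of the bootstrap masks (assumed behavior, not the project's actual helper):

import numpy as np

def combine_masks(masks, proportion):
    # Keep features selected in at least `proportion` of the bootstrap runs.
    return np.mean(np.asarray(masks), axis=0) >= proportion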
Example #12
# Imports assumed by this snippet; fm, nifti_masker, X, y and n_folds are
# defined upstream in the project.
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

y = fm.defineClass(y)
# Parameter that controls how many features are selected
cc = [0.1]
score = []
print("====== Original features =========")
print(X.shape)
print("*" * 80)
for c in cc:

    clf_l1_LR = LogisticRegression(C=c, penalty='l1', solver='liblinear', tol=0.001)
    clf_l1_LR.fit(X, y)
    coef = clf_l1_LR.coef_
    print("======= LR model =========")
    print(clf_l1_LR)
    model = SelectFromModel(clf_l1_LR, prefit=True)
    feature_mask = model._get_support_mask()  # obtain the selected-feature mask
    new_mask = feature_mask.astype('float64')
    coef_img = nifti_masker.inverse_transform(new_mask)
    coef_img.to_filename(r'D:\sub001_L1.img')

    XX = model.transform(X)
    yy = y
    print("==== New features =======")
    print(XX.shape)
    cv = StratifiedKFold(n_splits=n_folds)
    cv_scores = []

    for train, test in cv.split(XX, yy):
        svc = SVC(kernel='linear')
        svc.fit(XX[train], yy[train])
        prediction = svc.predict(XX[test])
        # The snippet is truncated here in the original; presumably the fold
        # accuracy was accumulated, e.g.:
        cv_scores.append(np.mean(prediction == yy[test]))
Example #13
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset (note: load_boston was removed in scikit-learn 1.2).
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()
clf.fit(X, y)
# Use the mean feature importance as the threshold
sfm = SelectFromModel(clf, threshold='mean', prefit=True)
print(X.shape)
#sfm = sfm.fit(X, y)
print("============ LassoCV ================")
print("Selected features")
print(sfm._get_support_mask())
n_features = sfm.transform(X).shape[1]
print(n_features)

# We use LinearSVC
from sklearn.svm import LinearSVC
# The smaller C is, the fewer features are selected
lsvc = LinearSVC(C=0.001, penalty="l1", dual=False)
y = y.astype(np.int64)  # cast to integers: this is a classifier, not a regressor
lsvc.fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
print("============ Linear SVM ===============================")
print("Selected features")
print(model._get_support_mask())
n_features = model.transform(X).shape[1]
print(n_features)
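
Since load_boston is gone from scikit-learn 1.2 onward, here is a minimal sketch of the same LassoCV + SelectFromModel pattern on a dataset that still ships with the library:

from sklearn.datasets import fetch_california_housing
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

X, y = fetch_california_housing(return_X_y=True)

clf = LassoCV().fit(X, y)
sfm = SelectFromModel(clf, threshold='mean', prefit=True)
print(sfm.get_support())            # boolean mask of kept features
print(sfm.transform(X).shape[1])    # number of features kept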