Example #1
 def transform(self, X):
     if (self.method == 'sfm'):
         if (self.n_vars is None):
             return sk_fs.SelectFromModel(self.clf,
                                          prefit=True).transform(X)
         else:
             return sk_fs.SelectFromModel(self.clf,
                                          max_features=self.n_vars,
                                          threshold=-np.inf,
                                          prefit=True).transform(X)
     elif (self.method == 'rfo'):
         return X[self.X_columns]
     elif (self.method == 'rfs'):
         return X[self.X_columns]
     elif (self.method == 'PCA'):
         return self.clf.transform(X)
     elif (self.method == 'skb'):
         return self.clf.transform(X)
     elif (self.method == 'eq'):
         return X
     elif (self.method == 'eli5_rfe'):
         return self.clf.transform(X)
     elif (self.method == 'biofes'):
         var_sel = list(
             self.Tcan.Disc.sort_values(by='0-1',
                                        ascending=False).iloc[:].index)
         return X[var_sel[0:min(self.n_vars, len(var_sel))]]
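The 'sfm' branch above combines max_features with threshold=-np.inf so that SelectFromModel keeps exactly the top n_vars features of an already fitted estimator. A minimal, self-contained sketch of that trick (the data and the value of max_features are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

# threshold=-np.inf disables the importance cut-off, so max_features alone
# decides how many of the highest-ranked features survive.
top5 = SelectFromModel(clf, prefit=True, max_features=5, threshold=-np.inf)
print(top5.transform(X).shape)  # (300, 5)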
Example #2
    def _get_support_mask(self):
        if self._cache_support_mask is not None:
            return self._cache_support_mask
        if self.prefit:
            estimator = self.estimator
        elif hasattr(self, 'estimator_'):
            estimator = self.estimator_
        else:
            raise ValueError(
                'Either fit the model before transform or set "prefit=True"'
                ' while passing the fitted estimator to the constructor.')
        try:
            importances = getattr(estimator, "feature_importances_", None)
            if importances is not None and np.isnan(importances).all():
                mask = np.ones(importances.shape, bool)
            else:
                mask = super(_SelectFromModel, self)._get_support_mask()
        except ValueError:
            sfm = sk_fsel.SelectFromModel(
                estimator.estimator_, self.threshold, True
            )
            mask = sfm._get_support_mask()

        for i in self._out_mask:
            mask[i] = False

        for i in self._in_mask:
            mask[i] = True
        self._cache_support_mask = mask
        return mask
Example #3
def get_thresh(model, train, test, label_test, label_train):
    if (len(test) > len(train)) or (len(label_test) > len(label_train)):
        raise TypeError('Invalid train and test size')
    if not isinstance(model, XGBClassifier):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1] >
            1) or (pd.DataFrame(label_test).shape[1] > 1):
        raise TypeError('Multiple columns in label, Invalid shape.')
    max_score = 0
    thrsh = 0
    thresholds = np.sort(model.feature_importances_)
    for thresh in thresholds:
        selection = feature_selection.SelectFromModel(model,
                                                      threshold=thresh,
                                                      prefit=True)
        select_X_train = selection.transform(train)
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)
        select_X_test = selection.transform(test)
        y_pred = selection_model.predict(select_X_test)
        scr = metrics.roc_auc_score(label_test, y_pred)
        if (scr > max_score):
            max_score = scr
            thrsh = thresh
    return thrsh
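A hedged usage sketch for get_thresh, assuming xgboost is installed and the module-level imports of the original project (np, pd, feature_selection, metrics) are in place; the synthetic data and split are illustrative:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = make_classification(n_samples=400, n_features=15, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

model = XGBClassifier().fit(X_tr, y_tr)
# Sweeps the fitted model's importance values as SelectFromModel thresholds
# and returns the one giving the best ROC AUC on the held-out split.
best_threshold = get_thresh(model, X_tr, X_te, y_te, y_tr)
print(best_threshold)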
Example #4
def get_important_features(estimator, X, threshold='median'):
    if isinstance(estimator, sklearn.linear_model.Lasso):
        selected_features = X.columns[estimator.coef_ != 0]
    else:
        tmp_model = feature_selection.SelectFromModel(estimator, prefit=True, threshold=threshold)
        selected_features = X.columns[tmp_model.get_support()]
    return selected_features 
Example #5
    def select_optimal_set(self, num_jobs):
        ''' Finds the best set of features to use through cross-validation.

        It performs a grid-search cross-validation over possible cost values and keeps
        the one that yields the highest performance. The selector object will be set to
        this optimal model.
        '''
        tuned_parameters = [{
            'C': [
                0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,
                50, 100, 500, 1000
            ]
        }]
        feature_selector_model = LinearSVC(penalty='l1',
                                           dual=False,
                                           max_iter=5000,
                                           tol=0.005,
                                           class_weight=self.penalty_weights)
        scorer = make_scorer(precision_score, pos_label=None, average='macro')
        # Don't pre-dispatch all jobs at once, only dispatch the ones you are running
        # so memory usage does not blow up
        skf = StratifiedKFold(n_splits=5, shuffle=True)
        clf = GridSearchCV(estimator=feature_selector_model,
                           param_grid=tuned_parameters,
                           n_jobs=num_jobs,
                           pre_dispatch="n_jobs",
                           cv=skf,
                           scoring=scorer)
        clf.fit(self.features, self.labels)
        self.current_model = clf.best_estimator_
        self.current_transformer = feature_selection.SelectFromModel(
            clf.best_estimator_, prefit=True)
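The same idea, an L1-penalised LinearSVC driving SelectFromModel, in a minimal standalone sketch without the grid search (C and the data are illustrative):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=30, n_informative=5, random_state=0)

# The l1 penalty drives uninformative coefficients to zero; SelectFromModel
# then keeps only the features whose coefficients survive.
svc = LinearSVC(C=0.1, penalty="l1", dual=False, max_iter=5000).fit(X, y)
selector = SelectFromModel(svc, prefit=True)
print(selector.transform(X).shape)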
Example #6
def FeatSelect(df, a, thresh, model_type):
    # a is the regularisation constant
    # thresh is the coefficient threshold below which features are dropped
    # model_type should be either "ridge" or "lasso"

    assert model_type == "ridge" or model_type == "lasso"

    #fit model
    copydf = df.copy()
    X = copydf.drop(["tna"], 1)
    t = copydf["tna"]
    if model_type == "ridge":
        reg = linear_model.Ridge(alpha=a)
    elif model_type == "lasso":
        reg = linear_model.Lasso(alpha=a)
    else:
        raise ValueError('model_type must be either "ridge" or "lasso"')
    reg.fit(X, t)

    #drop unimportant features
    model = feature_selection.SelectFromModel(reg,
                                              threshold=thresh,
                                              prefit=True)
    copydf_new = model.transform(X)

    #Adding Column titles
    feature_names = np.array(X.columns)
    selected_features = feature_names[model.get_support()]
    copydf_new = pd.DataFrame(data=X, columns=selected_features)
    copydf_new["tna"] = t

    return copydf_new
Example #7
def get_fs_model(model, method, train, target=None, cv=None):
    """Connects given model with specified feature selection method and trains
    the final structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    if method == "RFECV":
        model = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])

    # elif method == "Anova":
    # ANOVA SVM-C
    # anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    # model = Pipeline([
    #     ('feature_selection', anova_filter),
    #     ('data_mining', model)
    # ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
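A hedged usage sketch for get_fs_model, assuming the module-level names used above (fs_scikit, Pipeline, SelectFromModel, LinearSVC, sys) are imported as in the original project; the data and the chosen method are illustrative:

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=25, random_state=0)

# For methods such as "SelectPercentile" the function returns an unfitted
# Pipeline(feature_selection -> data_mining), so it still has to be fitted.
pipe = get_fs_model(DecisionTreeClassifier(random_state=0), "SelectPercentile", X, target=y)
pipe.fit(X, y)
print(pipe.predict(X[:5]))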
Example #8
def feature_selection_from_model(estimator, feature_data, target):
    estimator.fit(feature_data, target)
    
    features = pd.DataFrame({'feature':feature_data.columns, 'importance':estimator.feature_importances_})
    features.sort_values(by=['importance'], ascending=True, inplace=True)
    features.set_index('feature', inplace=True)
    features.plot(kind='barh', figsize=(20, 20))

    fs_model = feature_selection.SelectFromModel(estimator, prefit=True)
    return features, fs_model.transform(feature_data)
Example #9
def feature_select(X, y):
    import sklearn.ensemble as ensemble
    clf = ensemble.ExtraTreesClassifier(random_state=0)
    clf.fit(X, y)
    import sklearn.feature_selection as fselect
    model = fselect.SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    print("selected feature number:", X_new.shape)

    return X_new, model
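A hedged usage sketch: the fitted selector returned by feature_select can be reused to filter a held-out matrix with the same columns (the data here is illustrative):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=250, n_features=12, random_state=0)
X_train, X_test, y_train = X[:200], X[200:], y[:200]

X_train_sel, selector = feature_select(X_train, y_train)
X_test_sel = selector.transform(X_test)  # keeps the same columns for the held-out data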
Example #10
def select_from_model_gbdt(arr0, target):
    from sklearn.ensemble import GradientBoostingClassifier
    matrix = np.array(arr0)
    target = np.array(target)
    temp = feature_selection.SelectFromModel(GradientBoostingClassifier()).fit(
        matrix, target)
    indx = temp._get_support_mask().tolist()
    scores = get_importance(temp.estimator_).tolist()
    # result = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    result = temp.transform(matrix).tolist()
    return scores, indx, result
Example #11
def select_from_model_lr(arr0, target):
    from sklearn.linear_model import LogisticRegression
    matrix = np.array(arr0)
    target = np.array(target)
    temp = feature_selection.SelectFromModel(
        LogisticRegression(penalty="l1", C=0.1,
                           solver="liblinear")).fit(matrix, target)
    indx = temp._get_support_mask().tolist()
    scores = get_importance(temp.estimator_).tolist()
    # threthold = temp.threshold_
    # result = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    result = temp.transform(matrix).tolist()
    return scores, indx, result
Example #12
 def select_features(self, alpha=-1, threshold_q='mean'):
     fore_model = linear_model.RidgeCV(alphas=[alpha])
     fore_model.fit(self.train_X, np.ravel(np.array(self.train_Y)))
     sfm = feature_selection.SelectFromModel(fore_model,
                                             threshold=threshold_q,
                                             prefit=True)
     n_features = sfm.transform(self.train_X).shape[1]
     train_X_selected = pd.DataFrame(
         sfm.transform(self.train_X),
         index=self.train_X.index,
         columns=self.train_X.columns[sfm.get_support()])
     return train_X_selected, sfm
Example #13
def select_feature_by_L1(data_train, data_test):
    all_cols = [
        'sex', 'length', 'diameter', 'height', 'whole weight',
        'shucked weight', 'viscera weight', 'shell weight'
    ]
    Y = data_train['rings']
    X = data_train[all_cols]
    X_test = data_test[all_cols]

    svc = svm.LinearSVC(penalty='l1', dual=False).fit(X, Y)
    model = fs.SelectFromModel(svc, threshold=0.5, prefit=True)
    return (model.transform(X), model.transform(X_test))
Example #14
def lars():
    behavior_data, conn_data = pu.load_data_full_subjects()
    conn_data = conn_data.astype(float)

    categorical_variables = ['smoking', 'deanxit_antidepressants', 'rivotril_antianxiety', 'sex']
    categorical_data = behavior_data[categorical_variables]
    dummy_coded_categorical = pu.dummy_code_binary(categorical_data)
    covariate_data = pd.concat([behavior_data['age'], dummy_coded_categorical], axis=1)

    ml_data = pd.concat([conn_data, covariate_data], axis=1)
    target = behavior_data['distress_TQ'].values.astype(float)

    feature_names = list(ml_data)
    continuous_features = [f for f in feature_names if 'categorical' not in f]
    continuous_indices = [ml_data.columns.get_loc(cont) for cont in continuous_features]

    categorical_features = [f for f in feature_names if 'categorical' in f]
    categorical_indices = [ml_data.columns.get_loc(cat) for cat in categorical_features]

    ml_continuous = ml_data.values[:, continuous_indices]
    ml_categorical = ml_data.values[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(ml_continuous)
    ml_z = preproc.transform(ml_continuous)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(threshold=0).fit(ml_categorical)
    ml_v = varthresh.transform(ml_categorical)

    ml_preprocessed = np.hstack((ml_z, ml_v))

    # Feature selection with extra trees
    clf = ensemble.ExtraTreesRegressor()
    model = feature_selection.SelectFromModel(clf, threshold="2*mean")
    # Transform train and test data with feature selection model
    ml_cleaned = model.fit_transform(ml_preprocessed, target)
    feature_indices = model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]

    lars_classifier = linear_model.LarsCV(cv=3, fit_intercept=False)

    lars_classifier.fit(ml_cleaned, target)
    predicted = lars_classifier.predict(ml_cleaned)

    r2 = lars_classifier.score(ml_cleaned, target)

    exp_var = metrics.explained_variance_score(target, predicted)
    max_err = metrics.max_error(target, predicted)
    mae = metrics.mean_absolute_error(target, predicted)
    mse = metrics.mean_squared_error(target, predicted)
    print(r2)
Example #15
def process(df):
    pipeline = Pipeline(
        memory=None,
        steps=[
            ("standardize", StandardScaler()),
            ("feat_select", ftselect.SelectFromModel(RandomForestRegressor())),
            (
                "regressor",
                # ElasticNetCV(cv=len(df.index), max_iter=1000, normalize=False),
                LassoCV(cv=len(df.index), max_iter=3000),
            ),
        ],
    )

    return pipeline
Example #16
def feature_selection_trees(trainX, trainY, testX, testY):
    """
    Calculate the feature importances and select the most important features.
    It returns the filtered training and testing sets.
    """
    ## Feature selection
    clf = ensemble.ExtraTreesClassifier(random_state=1729,
                                        n_estimators=250,
                                        n_jobs=-1)
    selector = clf.fit(trainX, trainY)

    fs = feature_selection.SelectFromModel(selector, prefit=True)
    trainX = fs.transform(trainX)
    testX = fs.transform(testX)

    return trainX, testX
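A hedged usage sketch for feature_selection_trees, assuming the module imports ensemble and feature_selection as in the original project; the synthetic data is illustrative:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=40, n_informative=8, random_state=0)
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.3, random_state=0)

# Both splits are reduced using importances learned on the training split only.
trainX_sel, testX_sel = feature_selection_trees(trainX, trainY, testX, testY)
print(trainX_sel.shape, testX_sel.shape)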
Example #17
 def fit(self, on_engine, velocities, accelerations, *args):
     if on_engine.all():
         self.base = self.model = DefaultStartStopModel()
     else:
         X = np.column_stack((velocities, accelerations) + args)
         model = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=4)
         self.model = sk_pip.Pipeline([('feature_selection',
                                        sk_fsel.SelectFromModel(model)),
                                       ('classification', model)])
         self.model.fit(X, on_engine)
         model = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=3)
         self.base = sk_pip.Pipeline([
             ('feature_selection',
              sk_prep.FunctionTransformer(lambda X: X[:, :2])),
             ('classification', model)
         ])
         self.base.fit(X, on_engine)
     return self
Example #18
def FeatSelect(df, k, m, tna_predict, thresh):
    # m is the dimension of feature expansion
    copydf = df.copy()
    old_df = copydf  # This exists for us to extract values from later
    X = copydf.drop(["class"], axis=1)
    t = copydf["class"]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X, t)
    # drop unimportant features for each threshold tested
    model = feature_selection.SelectFromModel(clf, threshold=thresh, prefit=True)
    copydf_new = model.transform(X)
    #Adding Column titles
    feature_names = np.array(X.columns)
    selected_features = feature_names[model.get_support()]
    copydf_new = pd.DataFrame(data=X, columns=selected_features)
    copydf_new["class"] = t
    #Test new model
    return [old_df,copydf_new]
Example #19
def TNAregression(train_df,test_df):
    #Gives predicted TNA values using optimised ridge regression
    
    #Fixing training data
    real_probeA = uncorrupt(train_df)
    FE2_probeA = FE_without_tna(real_probeA,2)
    stdprobe2A = standardisation_without_tna(FE2_probeA)
    stdprobe2A["ones"] = 1
    copydf = stdprobe2A.copy()
    
    #Building regression model using known optimal hyper parameters
    X = copydf.drop(["tna"], 1)
    t = copydf["tna"]
    reg = linear_model.Ridge(alpha = 0.0016)
    reg.fit(X,t)
    
    #Feature selection
    model = feature_selection.SelectFromModel(reg, threshold=8.5, prefit=True)
    copydf_new = model.transform(X)
    
    #Adding Column titles for feature-selected dataframe
    feature_names = np.array(X.columns)
    selected_features = feature_names[model.get_support()]
    copydf_new = pd.DataFrame(data=X, columns=selected_features)
    copydf_new["tna"] = t
    
    #Rebuilding model for feature-selected dataframe
    X_new = copydf_new.drop(["tna"], axis=1)
    t_new = copydf_new["tna"]
    reg_new = linear_model.Ridge(alpha=0.0016)
    reg_new.fit(X_new, t_new)
    
    #Fixing test data
    real_probeB = uncorrupt(test_df)
    FE2_probeB = FE_without_tna(real_probeB,2)
    stdprobe2B = standardisation_without_tna(FE2_probeB)
    stdprobe2B["ones"] = 1
    copydf_test = stdprobe2B[X_new.columns]
    
    #tna_predict is column of tna predictions using optimal regression model
    tna_predict = reg_new.predict(copydf_test)
    tna_predict = pd.DataFrame(data=tna_predict) #converting to pandas DF to export
    
    return tna_predict
Example #20
def SplitTrainTest_GetFeat_RF(df, method_ImpFeature_select):
    '''
    Separates the response (first column of df) from the candidate features, fits a
    Random Forest classifier, and keeps the important features with SelectFromModel,
    using the mean importance score as the threshold.
    :param df: feature table data frame whose first column is the true label
    :param method_ImpFeature_select: method of calculating feature importance scores; string: 'mean_decrease_in_impurity' or 'permutation_importance'
    :return: a list [df_forSVM, n_features, elapsed_seconds, feat_imp]: the reduced data frame (true label plus selected features), the number of selected features, the fitting time in seconds, and the names of the selected features
    '''
    y = df.iloc[:, 0]  # subset truelabel column (response)
    x = df[list(df.iloc[:, 1:df.shape[1]])]
    feat_imp = []

    start = timer()
    clf_RF = RandomForestClassifier(bootstrap=True,
                                    n_estimators=500,
                                    n_jobs=2,
                                    oob_score=True,
                                    class_weight='balanced',
                                    criterion='gini',
                                    max_depth=60,
                                    max_features='sqrt',
                                    max_leaf_nodes=None,
                                    min_samples_leaf=2,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    random_state=0,
                                    verbose=1,
                                    warm_start=False)
    sfm = feature_selection.SelectFromModel(
        clf_RF, threshold='mean'
    )  # threshold of feature picking is the MEAN importance score # CAN CHANGE
    sfm.fit(x, y)  # fit RF classifier
    x_sfm = sfm.transform(x)  # pulling the important features only
    end = timer()

    # put together true label column + relevant features (from using SelectFromModel above)
    df_x_sfm = pd.DataFrame(x_sfm, columns=x.columns[sfm.get_support()])
    n_features = x_sfm.shape[1]
    df_forSVM = pd.concat([y, df_x_sfm], axis=1)
    for index in sfm.get_support(indices=True):
        feat_imp.append(x.columns[index])

    return [df_forSVM, n_features, end - start, feat_imp]
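A hedged usage sketch for SplitTrainTest_GetFeat_RF, assuming the project's module-level imports (pd, RandomForestClassifier, feature_selection, timer) and a scikit-learn version that accepts the hyperparameters above; the toy data frame is illustrative:

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
df = pd.DataFrame(np.column_stack([y, X]),
                  columns=["truelabel"] + ["feat_%d" % i for i in range(20)])

# Column 0 is treated as the response, the remaining columns as candidate features.
df_forSVM, n_features, elapsed, feat_imp = SplitTrainTest_GetFeat_RF(
    df, "mean_decrease_in_impurity")
print(n_features, feat_imp[:5])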
Example #21
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    n_samples, n_features, n_classes = \
            get_counts_tt(train_x, train_y)

    # pick number of components
    min_comp = random_projection.johnson_lindenstrauss_min_dim( \
            n_samples=n_samples, eps=eps)
    min_comp = min(min_comp, n_features)

    # scale and agglomerate to min_comp
    #scaler = preprocessing.StandardScaler()
    scaler = preprocessing.QuantileTransformer()
    feat_agg = cluster.FeatureAgglomeration( \
            n_clusters=min_comp)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    scaler2 = preprocessing.RobustScaler()
    #poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)

    # train the model pipeline
    dr_pipe = pipeline.Pipeline([('scaler', scaler), \
            ('feat_agg', feat_agg), ('scaler2', scaler2)])

    dr_pipe.fit(train_x)

    # transform train_x to train xtc
    train_x = dr_pipe.transform(train_x)
    # train the xtc
    xtc.fit(train_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))
    #print(xtc.feature_importances_)

    # create the feature selection model from the xtc
    feat_sel = feature_selection.SelectFromModel( \
            xtc, prefit=True, threshold=threshold)

    # create the pipeline to reduce dim then feature select
    drfs_pipe = pipeline.Pipeline(\
            [('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])

    return drfs_pipe
Example #22
def feature_selection_without_covariates(x_train, x_test, y_train,
                                         feature_names):
    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(x_train)
    x_train_z = preproc.transform(x_train)
    x_test_z = preproc.transform(x_test)

    # Feature selection with extra trees
    extra_tree_fs = ensemble.ExtraTreesClassifier(random_state=seed)
    feature_model = feature_selection.SelectFromModel(extra_tree_fs,
                                                      threshold="2*mean")

    # Transform train and test data with feature selection model
    x_train_feature_selected = feature_model.fit_transform(x_train_z, y_train)
    x_test_feature_selected = feature_model.transform(x_test_z)
    feature_indices = feature_model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]

    return x_train_feature_selected, x_test_feature_selected, cleaned_features
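Besides numeric values, SelectFromModel's threshold also accepts strings such as "mean", "median", or scaled forms like the "2*mean" used above; a minimal sketch (data illustrative):

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=300, n_features=25, random_state=0)

# Keep only the features whose importance exceeds twice the mean importance.
selector = SelectFromModel(ExtraTreesClassifier(random_state=0), threshold="2*mean")
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape, selector.threshold_)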
Example #23
def train():
    data = pd.read_csv(args.datadir, names=['hash', 'entropy', 'y'])
    X = data.drop(['hash', 'y'], axis=1).values
    y = data['y'].values

    # Feature selection using Trees Classifier
    fsel = ske.ExtraTreesClassifier().fit(X, y)
    model = feature_selection.SelectFromModel(fsel, prefit=True)
    X_new = model.transform(X)
    nb_features = X_new.shape[1]

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X_new, y, test_size=0.2)

    features = []
    # XXX : take care of the feature order
    for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
        features.append(data.columns[2 + f])

    #Algorithm comparison
    algorithms = {
        "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
        "RandomForest": ske.RandomForestClassifier(n_estimators=10),
        "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=10),
        "AdaBoost": ske.AdaBoostClassifier(n_estimators=10),
        "GNB": GaussianNB()
    }

    results = {}
    for algo in algorithms:
        clf = algorithms[algo]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print("%s : %f %%" % (algo, score * 100))
        results[algo] = score

    #choose best algorithm
    winner = max(results, key=results.get)

    #save model and features
    joblib.dump(algorithms[winner], os.path.join(args.output,
                                                 'classifier.pkl'))
Example #24
def get_features(X, y, fsm):
    if fsm == '1':
        # SelectKBest
        kBest = math.ceil(len(X[0]) / 2)
        feature_scores = f_selection.SelectKBest(chi2,
                                                 k=kBest).fit_transform(X, y)
        return feature_scores
    elif fsm == '2':
        # VarianceThreshold
        feature_scores = f_selection.VarianceThreshold(
            threshold=vThreshold).fit_transform(X)
        return feature_scores
    elif fsm == '3':
        # SelectFromModel
        clf = ExtraTreesClassifier(random_state=200)
        clf = clf.fit(X, y)
        model = f_selection.SelectFromModel(clf).fit(X, y)
        feature_scores = model.transform(X)
        return feature_scores
    else:
        raise ValueError("invalid fsm")
Example #25
    def modelSelector(self, model_name="rf", inplace=True):
        """
        :method_name {"rf", "lasso"}
        """
        print("Feature selecting method: ", model_name)
        selector = {
            "rf": ensemble.RandomForestClassifier(n_estimators=10),
            "lasso": linear_model.LassoCV(cv=5, max_iter=5000)
        }
        model = selector[model_name]
        sler = feature_selection.SelectFromModel(model)
        sler.fit(self.x, self.y)
        self.indexs = sler.get_support()

        if inplace:
            self.x = self.x[:, self.indexs]
            self.n = self.x.shape[1]
            self.featureNames = self.featureNames[self.indexs]
            return self.x, self.y
        else:
            return self.x[:, self.indexs], self.y
Example #26
def train_secondary(X_train, Y_train):

    clf = Pipeline(steps=[('pca',
                           feature_selection.SelectFromModel(DecisionTreeClassifier(),
                                                             threshold=0.05)),
                           ('dt', DecisionTreeClassifier())])

    params = dict(
                  dt__max_depth=np.arange(5, 105, 20),
                  )

    best_clf = GridSearchCV(clf, params, n_jobs=16, scoring='roc_auc',
                            verbose=0, cv=3)

    best_clf.fit(X_train, Y_train)

    joblib.dump(best_clf.best_estimator_, './best_secondary_model.pkl')

    print(best_clf.best_score_)
    print(best_clf.best_params_)

    return best_clf.best_estimator_
Example #27
def feature_selection_with_covariates(x_train, x_test, y_train,
                                      continuous_indices, categorical_indices,
                                      feature_names):
    # Split data for continuous, categorical preprocessing
    x_train_cont = x_train[:, continuous_indices]
    x_test_cont = x_test[:, continuous_indices]
    x_train_cat = x_train[:, categorical_indices]
    x_test_cat = x_test[:, categorical_indices]

    # Standardization for continuous data
    preproc = preprocessing.StandardScaler().fit(x_train_cont)
    x_train_z = preproc.transform(x_train_cont)
    x_test_z = preproc.transform(x_test_cont)

    # Variance threshold for categorical data
    varthresh = feature_selection.VarianceThreshold(
        threshold=0).fit(x_train_cat)
    x_train_v = varthresh.transform(x_train_cat)
    x_test_v = varthresh.transform(x_test_cat)

    x_train_data = np.hstack((x_train_z, x_train_v))
    x_test_data = np.hstack((x_test_z, x_test_v))

    # Feature selection with extra trees
    extra_tree_fs = ensemble.ExtraTreesClassifier(random_state=seed)
    feature_model = feature_selection.SelectFromModel(extra_tree_fs,
                                                      threshold="2*mean")

    # Transform train and test data with feature selection model
    x_train_feature_selected = feature_model.fit_transform(
        x_train_data, y_train)
    x_test_feature_selected = feature_model.transform(x_test_data)
    feature_indices = feature_model.get_support(indices=True)
    cleaned_features = [feature_names[i] for i in feature_indices]

    return x_train_feature_selected, x_test_feature_selected, cleaned_features
Example #28
def FeatSelectDT(df,thresh):
    
    #thresh is the threshold hyper parameter
    
    copydf = df.copy()
    
    #Building decision tree model
    X=copydf.drop(["class"], 1)
    t=copydf["class"]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X, t)
    
    #Drop unimportant features below the treshold hyper parameter
    model = feature_selection.SelectFromModel(clf, threshold=thresh, prefit=True)
    copydf_new = model.transform(X)
    
    #Adding Column titles
    feature_names = np.array(X.columns)
    selected_features = feature_names[model.get_support()]
    copydf_new = pd.DataFrame(data=X, columns=selected_features)
    copydf_new["class"] = t
    
    return copydf_new
Example #29
    def _get_support_mask(self):
        try:
            mask = super(_SelectFromModel, self)._get_support_mask()
        except ValueError:
            # SelectFromModel can directly call on transform.
            if self.prefit:
                estimator = self.estimator
            elif hasattr(self, 'estimator_'):
                estimator = self.estimator_
            else:
                raise ValueError(
                    'Either fit the model before transform or set "prefit=True"'
                    ' while passing the fitted estimator to the constructor.')
            sfm = sk_fsel.SelectFromModel(estimator.estimator_, self.threshold,
                                          True)
            mask = sfm._get_support_mask()

        for i in self._out_mask:
            mask[i] = False

        for i in self._in_mask:
            mask[i] = True

        return mask
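Examples #2 and #29 override _get_support_mask to force features in or out of the selection. A minimal sketch of the same idea as a small SelectFromModel subclass (the class name and the keep/drop indices are hypothetical, not the project's API):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

class PinnedSelectFromModel(SelectFromModel):
    """SelectFromModel whose mask always keeps the `keep` and drops the `drop` indices."""

    def __init__(self, estimator, keep=(), drop=(), threshold=None, prefit=False):
        super().__init__(estimator, threshold=threshold, prefit=prefit)
        self.keep = keep
        self.drop = drop

    def _get_support_mask(self):
        mask = super()._get_support_mask()
        mask[np.asarray(self.drop, dtype=int)] = False  # force these features out
        mask[np.asarray(self.keep, dtype=int)] = True   # force these features in
        return mask

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
sel = PinnedSelectFromModel(RandomForestClassifier(random_state=0), keep=[0], drop=[9])
print(sel.fit_transform(X, y).shape)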
Example #30
def select_features(
    X_train: pd.DataFrame,
    X_valid: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    parameters: dict,
    model=None,
) -> dict:
    """Extracts the most relevent features from the sample.

    Args:
        X_train: training data.

        X_valid: validation data.

        X_test: test data.

        y_train: training target.

        parameters: parameters defined in parameters.yml.

    Returns:
        A dictionary containing:
            The reduced training, validation and test data.
            Lists of included and excluded variables.
    """

    log = logging.getLogger(__name__)
    paras = parameters["features"]["selection"]

    if paras["skip"]:
        log.warning(red("Skipping feature selection."))
        pause()
        return dict(
            X_train_reduced=X_train,
            X_valid_reduced=X_valid,
            X_test_reduced=X_test,
            included=X_train.columns,
            excluded=[],
        )

    # Get feature names
    features = list(X_train.columns)

    if paras["type"] == "model":
        # Instantiate the selector and fit it to the training data
        lsvc = LinearSVC(**paras["model"]).fit(X_train, y_train)
        selector = fs.SelectFromModel(lsvc, prefit=True)
    elif paras["type"] == "k_best":
        selector = fs.SelectKBest(fs.mutual_info_classif,
                                  k=paras["k_best"]["k"])
        selector.fit(X_train, y_train)

    # Get feature mask
    mask = selector.get_support()
    included = list(compress(features, mask))
    excluded = list(set(features) - set(included))
    log.info(blue("Included {} variables: {}".format(len(included), included)))
    log.info(blue("Excluded {} variables: {}".format(len(excluded), excluded)))

    # Transform the datasets using the fitted selector
    X_train_reduced = selector.transform(X_train)
    X_valid_reduced = selector.transform(X_valid)
    X_test_reduced = selector.transform(X_test)

    return dict(
        X_train_reduced=X_train_reduced,
        X_valid_reduced=X_valid_reduced,
        X_test_reduced=X_test_reduced,
        included=included,
        excluded=excluded,
    )