Example #1
def fsel(bcl, X, d, m, forward=True, floating=False, cv=0, show=0):
    if show > 0:
        print('Feature Selection - ' + bcl[0] +
              ':  - number of features reducing from ' + str(X.shape[1]) +
              ' to ' + str(m) + ' ...')
    if bcl[0] == 'Fisher':
        sel = sfsfisher(X, d, m)
    else:
        estimator = defineModel(bcl)
        sfs = SFS(estimator,
                  k_features=m,
                  forward=forward,
                  floating=floating,
                  verbose=show,
                  scoring='accuracy',
                  cv=cv)
        sfs = sfs.fit(X, d)
        sel = list(sfs.k_feature_idx_)
        if show > 0:
            print(' ')
        if show:
            plot_sfs(sfs.get_metric_dict(), kind='std_err')
            plt.title('Sequential Forward Selection')
            plt.grid()
            plt.show()
    return sel
def sfs_eval(model_,xtest):
    # get best features
    best_feats_idx = model_.best_estimator_['selector'].k_feature_idx_
    best_feats = list(xtest.columns[list(best_feats_idx)].values.tolist())
    print('\nBest features: \n{}'.format(best_feats))

    # plotting feature selection characteristics
    plot_sfs(model_.best_estimator_['selector'].get_metric_dict(), kind='std_err', figsize=(12, 5))
    plt.title('Sequential Forward Selection (w. StdErr)')
    plt.grid(True, which='major', axis='both')
    plt.show()
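# For reference, a minimal self-contained sketch of the SFS + plot_sfs pattern used by
# fsel and sfs_eval above (the dataset and estimator here are illustrative, not the
# original project's):
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

X_demo, y_demo = load_iris(return_X_y=True)
sfs_demo = SFS(KNeighborsClassifier(n_neighbors=3),
               k_features=3,
               forward=True,
               floating=False,
               scoring='accuracy',
               cv=5)
sfs_demo = sfs_demo.fit(X_demo, y_demo)
print('Selected feature indices:', sfs_demo.k_feature_idx_)
print('CV score of the selected subset: %.3f' % sfs_demo.k_score_)
plot_sfs(sfs_demo.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()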
Example #3
def make_plot(sfs, k, weights, is_forward):
    fig = plot_sfs(
        sfs.get_metric_dict(),
        kind='std_dev',
    )

    axes = fig.add_subplot(111)
    a = axes.get_xticks().tolist()
    for i in range(len(a)):
        if i % 3 != 0:
            a[i] = ''
    axes.set_xticklabels(a)
    axes.tick_params(axis='both', which='major', labelsize=40)
    axes.tick_params(axis='both', which='minor', labelsize=40)

    fig.set_size_inches(20, 12, forward=True)
    plt.ylim([0.70, 1.00])
    plt.xlabel('Number of features', fontsize=40)
    plt.ylabel(SCORING, fontsize=40)
    # plt.title('Sequential feature selection (k = {0}, weights = {1})'.format(
    #         k, weights
    #     )
    # )
    # plt.grid(True)
    plt.savefig(
        '../../results/knn/features_selection/' +
        'knn_k={0}_weights={1}_forward={2}.svg'.format(k, weights, is_forward),
        format='svg',
        dpi=300)
Example #4
    def plot(self):
        if self.selector_name == 'rfe':
            step = self.selector.step if self.selector.step > 1 else int(
                self.selector.step * self.dim)
            plt.figure(figsize=(12, 9))
            plt.xlabel(f'Number of features tested x {step}')
            plt.ylabel('Cross-validation score')
            plt.plot(range(1,
                           len(self.selector.grid_scores_) + 1),
                     self.selector.grid_scores_)
            # plt.savefig('ELO-lgbmcv-02.png', dpi=150)
            plt.show()

        plot_sfs(self.selector.get_metric_dict(), kind='std_dev')
        plt.ylim([0.8, 1])
        plt.title('Sequential Forward Selection (w. StdDev)')
        plt.show()
Example #5
def fse_sfs(bcl, X, d, m, cv=0, show=0):
    estimator = defineModel(bcl)
    sfs = SFS(estimator,
              k_features=m,
              forward=True,
              floating=False,
              verbose=2,
              scoring='accuracy',
              cv=cv)
    sfs = sfs.fit(X, d)
    sel = sfs.k_feature_idx_
    print(' ')
    if show:
        plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()
    return sel
    def select_by_SFS(self, model=None):
        # Forward selection: start from an empty feature set and add the best remaining feature one at a time.
        # Shows how the score changes as the number of features grows (see the standalone sketch after figs_of_SFS below).
        # Selects the K best features.
        selector = SFS(model,
                  k_features=self.K,
                  forward=True,
                  floating=False,
                  # scoring='neg_mean_squared_error',
                  cv=0)
        selector.fit(self.train_X, self.train_y)
        k_feature = selector.k_feature_names_
        print('selected features:', k_feature)
        print('selected index:', selector.k_feature_idx_)

        if self.showFig:
            model_name = str(model).split('(')[0]
            plot_sfs(selector.get_metric_dict(), kind='std_dev')
            plt.title('SFS of {}'.format(model_name))
            plt.grid()
            plt.show()
    def figs_of_SFS(self, model=None):
        selector = SFS(model,
                       k_features=self.K,
                       forward=True,
                       floating=False,
                       # scoring='neg_mean_squared_error',
                       cv=0)
        selector.fit(self.train_X, self.train_y)
        model_name = str(model).split('(')[0]
        fig1 = plot_sfs(selector.get_metric_dict(), kind='std_dev')
        plt.title('SFS of {}'.format(model_name))
        plt.grid()
        plt.show()
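# The comments in select_by_SFS describe plain forward selection for a regression model;
# a minimal standalone sketch of that setup (the data and estimator are illustrative):
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X_d, y_d = load_diabetes(return_X_y=True)
sel_demo = SFS(Lasso(alpha=0.1),
               k_features=5,
               forward=True,
               floating=False,
               scoring='neg_mean_squared_error',
               cv=5)
sel_demo = sel_demo.fit(X_d, y_d)
print('selected index:', sel_demo.k_feature_idx_)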
Example #8
def plot_feed_forward_models():
    """
    Plots the performance for each iteration of the feedforward model.
    The numbers of features chosen are 15 and 20, since these showed the best results.

    """
    # create Linear Regression model
    regr = LinearRegression()

    sfs_model = SequentialFeatureSelector(regr,
                                          k_features=15,
                                          forward=True,
                                          floating=False,
                                          scoring='neg_mean_squared_error',
                                          cv=10)

    sfs_model = sfs_model.fit(X_train, y_train)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Linear Regression (w. StdErr)')
    plt.grid()
    plt.show()

    # Same for the Decision Tree, with some different settings
    clf = tree.DecisionTreeClassifier()

    sfs_model = SequentialFeatureSelector(clf,
                                          k_features=20,
                                          forward=True,
                                          floating=False,
                                          scoring='accuracy',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train_binned)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Decision Tree (w. StdErr)')
    plt.grid()
    plt.show()
def sequential_feature_selection(data_set, y_values, want_graph):
    lr = LinearRegression()
    sfs = SFS(lr,
              k_features=13,
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=10)
    sfs = sfs.fit(data_set, y_values)
    if want_graph:
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.title('Sequential Forward Selection (w. StdErr)')
        plt.grid()
        plt.show()

    return sfs
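# A possible follow-up to sequential_feature_selection above: use the fitted selector to
# reduce the dataset to the chosen columns (a sketch; data_set and y_values are assumed
# to be whatever was passed into the function):
sfs_fitted = sequential_feature_selection(data_set, y_values, want_graph=False)
data_set_reduced = sfs_fitted.transform(data_set)  # keeps only the 13 selected features
print('Selected feature indices:', sfs_fitted.k_feature_idx_)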
Example #10
y_pred = model.predict(X_test_scoring)
predictions = [round(value) for value in y_pred]
IG_Test_accuracy = accuracy_score(y_test_scoring, predictions)
print('Info Gain Accuracy (Test, Hold-Out): %.2f%%' % (IG_Test_accuracy * 100.0))

# WRAPPER-BASED FORWARD SEQUENTIAL SEARCH
# The Forward Sequential Search uses the Gradient Boosting classifier and adds features one at a time. The model is then re-evaluated with the smallest number of features that gives the best accuracy.

# It doesn't appear to add any value past ~7 features, so change k_features to 7 if this runs slowly
sfs_forward = SFS(model, k_features=44, forward=True, verbose=1, scoring='accuracy', cv=10, n_jobs=-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# This will create a graphic that shows performance (accuracy) as a solid blue line for each feature added,
# and the faint blue band is the standard deviation at each step
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
fig1 = plot_sfs(sfs_forward.get_metric_dict(), kind='std_dev', figsize=(10, 5))
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

"""
    DISCUSSION
From the graph above, it appears that 7 features give the best model; beyond that, performance plateaus again, just as with Information Gain. We will re-run the model using the 7 best features.
"""

# Rerun with 7 features
sfs_forward = SFS(model, k_features=7, forward=True, verbose=1, scoring='accuracy', cv=10, n_jobs=-1)
sfs_forward = sfs_forward.fit(X_train, y_train)

# Get the 7 features used
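# (Sketch, not part of the original notebook) One way to read the selected features back
# out of the refitted selector; k_feature_names_ is only meaningful if X_train is a
# pandas DataFrame, otherwise use the indices:
selected_idx = list(sfs_forward.k_feature_idx_)
print('Selected feature indices:', selected_idx)
X_train_best = sfs_forward.transform(X_train)  # reduce X_train to the 7 selected columns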
Example #11

cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
sfs_ridge_forward= SFS(Ridge(alpha=0.1),
          k_features=4,
          forward=True,
          floating=True,
          scoring = 'neg_mean_squared_error',
          verbose=2,
          cv = cv)
sfs_ridge_forward.fit(X_norm, y)
sfs_ridge_forward.k_feature_names_



fig1 = plot_sfs(sfs_ridge_forward.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.ylabel('Performance')
plt.grid()
plt.savefig("forward_processing_Porperty_ridge_"+name+".png", dpi=300)
plt.show()


X_selected_columns = list(sfs_ridge_forward.k_feature_names_)
X_selected = X_norm[X_selected_columns]
ridge = Ridge()
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=1)
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 100]}
ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=cv)
result_ridge = ridge_regressor.fit(X_selected, y)
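# (Sketch) The fitted GridSearchCV object can then be inspected for the tuned alpha and
# its cross-validated score:
print('Best alpha:', result_ridge.best_params_['alpha'])
print('Best CV score (neg. MSE):', result_ridge.best_score_)
best_ridge = result_ridge.best_estimator_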
Example #12
y_btr = y[y == 1][:smpnum]
x_btr = x[y == 1][:smpnum]
for i in range(2, 6):
    x_btr = np.concatenate([x_btr, x[y == i][:smpnum]])
    y_btr = np.concatenate([y_btr, y[y == i][:smpnum]])

x_tr, x_te, y_tr, y_te = train_test_split(
    x_btr,
    y_btr,
    test_size=0.20,
)

best = do_sfs(x_tr, y_tr)

# examine the results
plot = plot_sfs(best.get_metric_dict())
plot[1].figure.savefig("SFS-" + str(n_features) + ".png")
for i in range(1, 11):
    print(i, best.get_metric_dict()[i]['avg_score'])

test_svm(x_all, y_all)

# make a more selective dataset
# Filter the rest of the data
x_obs, y_obs, x_nuls = load_data()
keep = list(best.k_feature_idx_)
np.save('sfs_features', keep)
# keep = np.load('sfs_features.npy')
x_obs = x_obs[:, keep]
x_nuls = x_nuls[:, keep]
Example #13

# **Best subset of features selected after the feature selection process.**

# In[199]:


f_selector.k_feature_names_


# **Plot of number of features vs. performance of the regressor.**

# In[200]:


plot_sfs(f_selector.get_metric_dict(),kind='std_dev')


# Selecting the best subset of features and removing others from X_Train.

# In[201]:


feat_random_forest=list(f_selector.k_feature_names_)
X_train_rf=X_train_rf.loc[:,list(f_selector.k_feature_names_)]


# Using GridSearchCV for Hyperparameter Tuning.

# In[206]:
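# The tuning cell itself is not shown above; a hedged sketch of what a GridSearchCV setup
# for the reduced random-forest feature set might look like (the parameter grid is
# illustrative and y_train is assumed to be the corresponding target):
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 10, 20]}
grid_rf = GridSearchCV(RandomForestRegressor(random_state=42),
                       param_grid, cv=5, scoring='r2')
grid_rf.fit(X_train_rf, y_train)
print(grid_rf.best_params_, grid_rf.best_score_)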
Example #14
def ExecuteSFFS(x, y, featureNames, featureList, clusters, clusterNames, svc,
                kFolds, nbOfSplit, featMaxNbrSFFS, standardizationType,
                removedData, permutation_flag, nbPermutation, balance_flag,
                currentDateTime, resultDir, debug_flag, verbose):
    import scipy
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split as tts
    from sklearn.metrics import confusion_matrix
    from mlxtend.feature_selection import SequentialFeatureSelector as SFS
    from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
    from sklearn.model_selection import RandomizedSearchCV

    from slpClass_toolbox import BalanceClasses
    from slpClass_toolbox import Standardize
    from slpClass_toolbox import Permute
    from slpClass_toolbox import ComputePermutationAvgDA
    from slpClass_toolbox import PlotPermHist
    from slpClass_toolbox import ApplyStandardization
    from slpClass_toolbox import plot_confusion_matrix

    plt.rcParams.update({'figure.max_open_warning': 0})

    # Get feature values since SFFS works only with numpy arrays
    bestFeaturesHist = np.zeros([len(featureNames)])
    CvResult = pd.DataFrame()
    permResults = pd.DataFrame()
    tmpBest = []
    DA = []
    avg_perm_DA = []
    skipFS = False  # flag to skip feature selection
    fitFeatOverTresh = False  # fit classifier with most frequent features in best set

    #********************** TRAIN pre-processing ******************************
    for it in list(range(nbOfSplit)):
        print('\nSplit #{}'.format(str(it)))

        # Use all features or given ones only
        if len(featureList) == 0:
            xx = x
        elif isinstance(featureList[0], float):
            xx = x
            fitFeatOverTresh = True
        else:
            xx = x[featureList]
            skipFS = True

        # Balance the number of old women and old men, or not
        if balance_flag:
            X, Y = BalanceClasses(xx, y)
        else:
            X, Y = xx, y

        # split dataset into random train and test subsets
        X_train, X_test, y_train, y_test = tts(X,
                                               Y['Cluster'],
                                               test_size=0.33,
                                               stratify=Y['Cluster'])
        # Data z-score standardisation
        xTrainSet, zPrm = Standardize(X_train, y_train, standardizationType,
                                      debug_flag)

        #**************************** SVM optimisation ************************
        params_dict = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
            'class_weight': ['balanced', None]
        }

        n_iter_search = 20
        random_search = RandomizedSearchCV(svc,
                                           param_distributions=params_dict,
                                           n_iter=n_iter_search)

        random_search.fit(xTrainSet, y_train)
        optimClf = random_search.best_estimator_

        #*************************** TRAIN ************************************
        print('Fitting...')
        if skipFS:
            optimClf = optimClf.fit(xTrainSet.to_numpy(), y_train)

            yPred = optimClf.predict(xTrainSet.to_numpy())

            # Compute the accuracy of the test prediction
            acc = float((y_train == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(it + 1)])

        else:
            # set k_features = (1,X.shape[1]) to test all possible combinations
            sffs = SFS(optimClf,
                       k_features=(1, featMaxNbrSFFS),
                       forward=True,
                       floating=False,
                       scoring='accuracy',
                       cv=kFolds,
                       n_jobs=-1)
            sffs = sffs.fit(xTrainSet.to_numpy(), y_train)

            print('Best combination for fit #%d (ACC: %.3f): %s' % \
                  (it,sffs.k_score_, sffs.k_feature_idx_))

            # Fit the estimator using the new feature subset and make a
            # prediction on the test data
            X_train_sfs = sffs.transform(xTrainSet.to_numpy())
            optimClf.fit(X_train_sfs, y_train)

            fitRes = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
            fitRes['avg_over_std'] = fitRes['avg_score'] / fitRes['std_dev']

            if featMaxNbrSFFS > 1:
                # plot feature selection process metrics
                fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_err')
                savedPlotName = resultDir+'Decoding_accuracy_'+clusters+'_'+\
                                str(it)+'_'+str(nbOfSplit)+'.png'

                tmpBest.append(sffs.k_feature_idx_)
                bestFeaturesHist[list(tmpBest[-1])] += 1

                fig1.set_dpi(300)
                plt.tight_layout()
                plt.savefig(savedPlotName, bbox_inches='tight')
                plt.clf()
                plt.close(fig1)

                # plot mean / std
                plt.figure(dpi=300)
                plt.title('Mean over standard deviation')
                plt.xlabel("number of features in combination")
                plt.xticks(range(featMaxNbrSFFS))
                plt.ylabel("Mean over standard deviation")
                plt.plot(list(range(1, featMaxNbrSFFS + 1)),
                         fitRes['avg_over_std'])
                figName = resultDir+'SFFS_'+clusters+'_bestSet_metric_'+ \
                          str(it)+'_'+str(nbOfSplit)
                plt.savefig(figName, bbox_inches='tight')
                plt.clf()
                plt.close()

        # add metrics iteration identifier
        fitRes = fitRes.add_suffix('_' + str(it + 1))

        CvResult = pd.concat([CvResult, fitRes], axis=1)

        #***************************** TEST ***********************************
        print('Testing...')
        # standardize test set using trainset standardization parameters
        xTestSet = ApplyStandardization(X_test, zPrm)

        # prepare test data
        if skipFS:
            xTest = xTestSet
            savedPlotName = resultDir+clusters+'_ConfusionMatrix_'+str(it+1)+ \
                            '_'+str(nbOfSplit)
        else:
            # Generate a new subset of data according to selected features
            xTest = sffs.transform(xTestSet.to_numpy())
            savedPlotName = resultDir+'SFFS_'+clusters+'_ConfusionMatrix_'+ \
                        str(it+1)+'_'+str(nbOfSplit)

        # actually test classifier and compute decoding accuracy on predictions
        y_pred = optimClf.predict(xTest)
        acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
        print('Test set accuracy: %.2f %%' % (acc * 100))
        DA.append(acc)  # stack test DA for further use

        # plot confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        fig_CM = plt.figure(dpi=300)
        plot_confusion_matrix(cm,
                              clusterNames,
                              title=savedPlotName,
                              normalize=True,
                              precision=2)
        plt.clf()
        plt.close(fig_CM)

        #**************** STATISTICAL ASSESSMENT (PERMUTATION) ****************
        if permutation_flag:
            permResults['permutation_DA_' + str(it)] = Permute(
                clusters,
                xTrainSet,
                xTestSet,
                y_train,
                y_test,
                nbPermutation,
                standardizationType,
                debug_flag=0)
            avg_perm_DA.append(
                np.mean(permResults['permutation_DA_' + str(it)]))

    dfDA = pd.DataFrame(data=DA, columns=['DA_test'])
    #    CvResult = pd.concat([CvResult, dfDA[:]], axis=1)
    CvResult = pd.concat([
        CvResult, dfDA[:],
        pd.DataFrame(data=[np.mean(DA)], columns=['avg_DA'])
    ],
                         axis=1)

    #***************** COMPUTE STATISTICAL ASSESSMENT RESULTS *****************
    if permutation_flag:
        # compute permutation DA average and keep results in a dataframe
        print('\nAverage permutation DA')
        for i in list(range(len(avg_perm_DA))):
            print('\t' + str(avg_perm_DA[i]))

        savedHistName = resultDir + 'Average_Permutation_hist_' + clusters + '.png'
        PlotPermHist(permResults, CvResult['avg_DA'].iloc[0], currentDateTime,
                     savedHistName)
        # formatting permutation results to save in an Excel file
        permResults = pd.concat(
            [permResults, ComputePermutationAvgDA(avg_perm_DA)], axis=1)
        print('Mean permutation decoding accuracy : {}'.format(
            np.mean(permResults['Avg_Permutation_DA_per_epoch'])))
    else:  # binomial law
        from scipy.stats import binom
        q = 0.001  # p value
        n = X.shape[0] + 1  # number of observations (subjects)
        p = 1 / len(clusterNames)  # probability of a correct trial by chance
        luckLvl = pd.DataFrame(data=[binom.isf(q, n, p) / n],
                               columns=['Chance_Level'])

#****************************** Compute results *******************************
    if not skipFS:
        # Build structure of histogram data to save in excel
        hist = pd.DataFrame(data=featureNames, columns=['Features_Name'])
        hist['Occurence_Best'] = bestFeaturesHist
        # Search best set across every iteration best set
        best_Combination = tmpBest[np.argmax(DA)]
        # Compute average size of best combination
        l = 0
        for n in list(range(len(tmpBest))):
            l += len(tmpBest[n])
        avgBestCombSize = pd.DataFrame(data=[np.ceil(l / len(tmpBest))],
                                       columns=['avgBestCombSize'])

        #    subsetHist = GetSubsetOccurence(tmpBest)
        #    PlotHist(subsetHist[1],'Subsets occurences',subsetHist[0],'Comb_Hist.png')

        # Get best set's feature names
        tmp = []
        tmp.append(np.max(DA))
        for i in best_Combination:
            tmp.append(featureNames[i])
            print('\t' + featureNames[i])
        bestFeatNames = pd.DataFrame(data=tmp, columns=['Best_Features_Set'])

        sffsRes = pd.concat([hist, bestFeatNames, avgBestCombSize], axis=1)

        # Plot best combination custom metric (mean / std_dev)
        from slpClass_toolbox import PlotBestCombinationMetrics
        filteredData = CvResult.filter(regex=r'avg_over_std_', axis=1)
        metrics = pd.DataFrame(data=filteredData)
        metrics.dropna(inplace=True)
        figName = resultDir + 'SFFS_' + clusters + '_bestSet_metric_aggreg.png'
        PlotBestCombinationMetrics(metrics, figName)

    #save training and permutation results in an excel file
    nbSubject = pd.DataFrame(data=[len(X)], columns=['Number_Of_Subjects'])

    #************************ Build results structure *************************
    excelResults = pd.concat([
        CvResult, permResults if permutation_flag else luckLvl,
        sffsRes if not skipFS else None, removedData, nbSubject
    ],
                             axis=1)

    print('Mean Decoding accuracy :{}'.format(np.mean(DA)))

    # compute occurrence of every subset in the best sets of every iteration
    #    from slpClass_toolbox import GetSubsetOccurence
    #    subsetHist = GetSubsetOccurence(tmpBest)
    #    excelResults = pd.concat([excelResults, subsetHist], axis=1)
    #    excelResults.to_excel(saveTo, sheet_name=xlSheetName)

    if fitFeatOverTresh:
        tresh = featureList[0] * nbOfSplit
        bestFeatColumns = hist.iloc[:, 0][hist.iloc[:, 1] > tresh]
        bestDataSet = xx[bestFeatColumns]
        classes = y
        DABestFeat = []
        print('Fitting with features occurring over %d times in best sets' %
              tresh)
        for i in list(range(nbOfSplit)):
            print('\rFit #{} of {}\n'.format(i + 1, nbOfSplit),
                  end='\r',
                  flush=True)
            # Balance the number of old women and old men, or not
            if balance_flag:
                XX, YY = BalanceClasses(bestDataSet, classes)
            else:
                XX, YY = bestDataSet, classes

            # split dataset into random train and test subsets
            XXtrain, XXtest, yytrain, yytest = tts(XX,
                                                   YY['Cluster'],
                                                   test_size=0.33,
                                                   stratify=YY['Cluster'])
            # Data z-score standardisation
            xxTrainSet, zzPrm = Standardize(XXtrain, yytrain,
                                            standardizationType, debug_flag)

            # fit and predict on training data
            optimClf = optimClf.fit(xxTrainSet.to_numpy(), yytrain)
            yPred = optimClf.predict(xxTrainSet.to_numpy())
            # Compute accuracy of prediction on training set
            acc = float((yytrain == yPred).sum()) / yPred.shape[0]
            print('Train predicted accuracy: %.2f %%' % (acc * 100))
            fitRes = pd.DataFrame(data=[acc], columns=['CV_DA_' + str(i + 1)])

            # test classifier and compute decoding accuracy on predictions
            xxTestSet = ApplyStandardization(XXtest, zzPrm)
            yypred = optimClf.predict(xxTestSet)
            acc = float((yytest == yypred).sum()) / yypred.shape[0]
            print('Test set accuracy: %.2f %%' % (acc * 100))
            DABestFeat.append(acc)  # stack test DA for further use
            # plot confusion matrix
            cm = confusion_matrix(yytest, yypred)
            fig_CM = plt.figure(dpi=300)
            plot_confusion_matrix(cm,
                                  clusterNames,
                                  title=savedPlotName,
                                  normalize=True,
                                  precision=2)
            plt.clf()
            plt.close(fig_CM)
        df = pd.DataFrame(data=DABestFeat, columns=['optim DA'])
        df = pd.concat([
            df,
            pd.DataFrame(data=[np.mean(DABestFeat)], columns=['optim avg DA'])
        ],
                       axis=1)
        print('Classifier trained with best features (occ > %d) only' % tresh)
        print(df)
        excelResults = pd.concat([excelResults, df], axis=1)

    return excelResults
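# ExecuteSFFS above passes a (min, max) tuple as k_features, which lets mlxtend pick the
# best-scoring subset size within that range; a minimal isolated sketch of that form
# (the estimator and dataset are illustrative):
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X_bc, y_bc = load_breast_cancer(return_X_y=True)
sfs_range = SFS(DecisionTreeClassifier(random_state=0),
                k_features=(1, 10),  # search every subset size from 1 to 10 features
                forward=True,
                floating=False,
                scoring='accuracy',
                cv=3,
                n_jobs=-1)
sfs_range = sfs_range.fit(X_bc, y_bc)
print('Best subset size:', len(sfs_range.k_feature_idx_))
print('Best CV accuracy: %.3f' % sfs_range.k_score_)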
Example #15
selection_res = pd.DataFrame.from_dict(sfs4.get_metric_dict()).T
# print(selection_res)
selection_res.to_csv(
    "/Users/shuojiawang/Documents/ppdmodel/result1907/selection_log_withistoryrf.csv",
    sep='\t')

selected_feature_idx = sfs4.k_feature_idx_
#print(type(selected_feature_idx))
selected_feature = list(selected_feature_idx)
feature_name = []
for i in selected_feature:
    feature_name.append(feature_names[i])
print(feature_name)

fig = plot_sfs(sfs4.get_metric_dict(), kind='std_err')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.xlabel("Feature number")
plt.ylabel("AUC")
plt.grid()
#plt.savefig("Users/bu/Desktop/feature_selection.png", dpi=600)
plt.show()
#plt.clf()

from sklearn.model_selection import learning_curve
# Create CV training and test scores for various training set sizes
train_sizes, train_scores, test_scores = learning_curve(
    ensemble.RandomForestClassifier(),
    X,
    y,
    # Number of folds in cross-validation
Example #16
def feature_selection_mlextend(dataframe, target_feature_name, scoring_metric,
                               n_jobs, cross_val, range_of_features):
    """
    This function will take a dataframe as input and perform the following:

    a) Run sequential forward feature selection over the given range of subset sizes
    b) Return the dataframe reduced to the selected features plus the target

    :param dataframe: dataframe with features to select from
    :param target_feature_name: target name of the feature
    :param scoring_metric: f1_weighted, accuracy, roc_auc etc.
    :param cross_val: # of cross validation folds
    :param n_jobs: # of cpu jobs
    :param range_of_features: tuple of the range of features to select from (low, high)
    :return: dataframe with features reduced
    """

    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError("Object passed is not a dataframe")

    if not isinstance(range_of_features, tuple):
        raise ValueError("range_of_features passed is not a tuple")

    import lightgbm as lgb

    classifier = lgb.LGBMClassifier(n_jobs=n_jobs,
                                    class_weight="balanced",
                                    max_depth=6,
                                    random_state=2019)

    x = dataframe.drop(target_feature_name, axis=1)
    y = dataframe[target_feature_name]

    # forward selection

    sequential_forward_feature_selection = sfs(classifier,
                                               k_features=range_of_features,
                                               forward=True,
                                               n_jobs=n_jobs,
                                               floating=False,
                                               verbose=False,
                                               scoring=scoring_metric,
                                               cv=cross_val)

    sfs_algo = sequential_forward_feature_selection.fit(x, y)

    sfs_cross_val_score = sfs_algo.k_score_

    cross_val = round(sfs_cross_val_score, 2)

    selected_features = list(sfs_algo.k_feature_names_)

    print("Number of features selected is:  {}".format(
        len(sfs_algo.k_feature_names_)))

    print("Cross Validation Score for {}, is {}".format(
        scoring_metric, cross_val))

    plot_sfs(sfs_algo.get_metric_dict(), kind='std_err', figsize=(11, 7))

    df = x[selected_features]

    # merge the target variable back into the dataframe (df)
    df = df.merge(y, left_index=True, right_index=True)

    cat_features = []

    for col_name in df.columns:
        if df[col_name].dtype != 'float64':
            if col_name != 'loan_status':
                cat_features.append(col_name)

    print(cat_features)

    return df
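# (Sketch) A call to feature_selection_mlextend above might look like this; loans_df is a
# hypothetical dataframe and 'loan_status' its target column:
reduced_df = feature_selection_mlextend(dataframe=loans_df,
                                        target_feature_name='loan_status',
                                        scoring_metric='f1_weighted',
                                        n_jobs=-1,
                                        cross_val=5,
                                        range_of_features=(5, 15))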
Example #17
cols=x.columns.tolist()
lr=LinearRegression()
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
sfs = SFS(lr,
          k_features=10,
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=20)
#without Autoregression
sfs = sfs.fit(x, y)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
#Autoregression with time one hour
sfs_1= sfs.fit(X_t_1,Y_tplus1 )
fig = plot_sfs(sfs_1.get_metric_dict(), kind='std_err')
#Autoregression with time two hours
sfs_2= sfs.fit(X_t_2,Y_tplus2 )
fig = plot_sfs(sfs_2.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
print(sfs.k_feature_names_)
print(sfs.k_score_)
print(sfs.subsets_)

import pandas as pd
mask = selector.support_
print(f"Best features according to RFE {X_m.columns[mask].values}")

X_m1 = X_m.iloc[:,mask]
# We could have used train test split or cross validation strategies
# for scoring the model but in order to compare with the stats model 
# we will use the whole data
model1 = LinearRegression().fit(X_m1,y_m)
print(f"R2 Score: {model1.score(X_m1,y_m)}")

"""### Forward Selection"""

model = LinearRegression(fit_intercept=False)
sfs1 = sfs(model,k_features=20,forward=True,scoring='r2',cv=5)
sfs1.fit(X_m,y_m)
fig = plot_sfs(sfs1.get_metric_dict())
plt.title('Forward Selection')
plt.grid()
plt.show()

print(sfs1.k_features, sfs1.k_feature_names_,sep="\n")

index = list(sfs1.k_feature_idx_)
X_m1 = X_m.iloc[:,index]
model1 = LinearRegression().fit(X_m1,y_m)
print(f"R2 Score: {model1.score(X_m1,y_m)}")

"""## Regularization
1. Lasso
2. Ridge
3. ElasticNet
Example #19
sfs1 = SFS(estimator=classifier_, 
           k_features=(5, 30),
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=3)

pipe = make_pipeline(StandardScaler(), sfs1)

pipe.fit(X_train, y_train)

print('best combination (ACC: %.3f): %s\n' % (sfs1.k_score_, sfs1.k_feature_idx_))
print('all subsets:\n', sfs1.subsets_)

from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
plot_sfs(sfs1.get_metric_dict(), kind='std_err');



selected_features1 = list(sfs1.k_feature_names_)
# save the model to disk
model = LogisticRegression()
model.fit(X_train, Y_train)
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)

#%%
# Feature Importance
Example #20
    def change(self, x_train, y_train, percentage, mnb, change_plan):
        number_change_requested = int(percentage / 100 * x_train.shape[0])
        print("{} percent error corresponds to {} changes \n".format(
            percentage, number_change_requested))

        # find the most important features

        sfs = SFS(mnb,
                  k_features=len(x_train[0]),
                  forward=True,
                  floating=False,
                  verbose=2,
                  scoring='accuracy',
                  cv=5)
        pipe = make_pipeline(StandardScaler(), sfs)
        pipe.fit(x_train, y_train)

        #-------------plotting------------------
        fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
        plt.show()

        # get the features in SFS order and change only those
        x_train_changed = np.copy(x_train)
        used_row = {}
        all_changed = 1

        for i in range(len(change_plan["key"])):

            occurred_change = 0
            indices = [
                t for t, x in enumerate(y_train)
                if x == change_plan["key"][i][0]
            ]

            print("{} rows have target {} \n".format(len(indices),
                                                     change_plan["key"][i][0]))

            for L in range(1, len(sfs.subsets_) + 1):  #number of the features
                subset = list(sfs.subsets_[L]['feature_idx'])

                if (occurred_change == change_plan["number"][i]):
                    break
                print("change feature index {} ----".format(subset))
                for p in range(len(indices)):
                    x_train_changed[indices[p]][subset] = 0

                    if y_train[indices[p]] == mnb.predict(
                        [x_train[indices[p]]]) and indices[p] not in used_row:

                        if (change_plan["key"][i][1] == mnb.predict(
                            [x_train_changed[indices[p]]])[0]):

                            print(
                                "with change features index {} row number {} has been changed"
                                .format(subset, indices[p]))
                            print(x_train[indices[p]],
                                  mnb.predict([x_train[indices[p]]])[0])
                            print(
                                x_train_changed[indices[p]],
                                mnb.predict([x_train_changed[indices[p]]])[0])

                            print(
                                " \n change number {} \n".format(all_changed))
                            used_row.update({indices[p]: indices[p]})
                            occurred_change = occurred_change + 1
                            all_changed = all_changed + 1

                            if (occurred_change == change_plan["number"][i]):
                                print("part of your request has been done :)")
                                break
                        else:
                            x_train_changed[indices[p]] = np.copy(
                                x_train[indices[p]])
                    else:
                        x_train_changed[indices[p]] = np.copy(
                            x_train[indices[p]])

            #check for rest of the possible changes

            # for LL in range(0, len(x_train_changed[0]) + 1):
                print(
                    "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
                )

                for subsets in Change_Combination.combinations_index(
                        self, x_train_changed[0], L):
                    if (subset != subsets):
                        if not subsets:
                            pass
                        else:
                            if (occurred_change == change_plan["number"][i]):
                                #print("part of your request has been done :))))")
                                break
                            print(
                                "change feature index {} ----".format(subsets))
                            for pp in range(len(indices)):
                                x_train_changed[indices[pp]][subsets] = 0

                                if y_train[indices[pp]] == mnb.predict([
                                        x_train[indices[pp]]
                                ]) and indices[pp] not in used_row:

                                    if (change_plan["key"][i][1] ==
                                            mnb.predict([
                                                x_train_changed[indices[pp]]
                                            ])[0]):

                                        print(
                                            "with change features index {} row number {} has been changed"
                                            .format(subsets, indices[pp]))
                                        print(
                                            x_train[indices[pp]],
                                            mnb.predict([x_train[indices[pp]]
                                                         ])[0])
                                        print(
                                            x_train_changed[indices[pp]],
                                            mnb.predict([
                                                x_train_changed[indices[pp]]
                                            ])[0])

                                        print(" \n change number {} \n".format(
                                            all_changed))
                                        used_row.update(
                                            {indices[pp]: indices[pp]})
                                        occurred_change = occurred_change + 1
                                        all_changed = all_changed + 1

                                        if (occurred_change ==
                                                change_plan["number"][i]):
                                            print(
                                                "part of your request has been done :)"
                                            )
                                            break
                                    else:
                                        x_train_changed[indices[pp]] = np.copy(
                                            x_train[indices[pp]])
                                else:
                                    x_train_changed[indices[pp]] = np.copy(
                                        x_train[indices[pp]])
                    else:
                        print(
                            "subsets are equal {}----------------------------------------------"
                            .format(subsets))

        if (all_changed <= number_change_requested):
            print("your request doesn't complete! please change your plan")
        else:
            print("your request is done :)")

        return np.copy(x_train_changed)
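# The loop above indexes sfs.subsets_[L]['feature_idx']. For reference, subsets_ is a dict
# keyed by subset size; each entry holds 'feature_idx', 'cv_scores' and 'avg_score' (and, in
# newer mlxtend versions, 'feature_names'). A toy illustration (data and estimator are not
# from the original):
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X_w, y_w = load_wine(return_X_y=True)
sfs_demo = SFS(RandomForestClassifier(n_estimators=50, random_state=0), k_features=3,
               forward=True, floating=False, scoring='accuracy', cv=3)
sfs_demo = sfs_demo.fit(X_w, y_w)
for size, info in sfs_demo.subsets_.items():
    print(size, info['feature_idx'], round(info['avg_score'], 3))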
Example #21
y = imp_2.transform(y)

y = y.reshape(-1)

# Feature scaling
#-----------------------------------------------------------------------------------------------------------------------
sc = preprocessing.StandardScaler()
sc.fit(X)
X = sc.transform(X)
#-----------------------------------------------------------------------------------------------------------------------

#rbf_svr = SVR(kernel='rbf', C=1e3)
RF = RandomForestRegressor(n_estimators=10, criterion='squared_error', random_state=14)

sfs = SFS(
    RF,
    k_features=10,
    forward=True,
    floating=False,
    scoring='r2',  # e.g. 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'
    cv=10)  # n_jobs=-1 would use all CPUs

sfs = sfs.fit(X, y)
fig = plot_sfs(sfs.get_metric_dict(),
               kind='std_dev')  #{'std_dev', 'std_err', 'ci', None}.

plt.title('Sequential Forward Selection (monsoon)')
plt.grid()
plt.show()
Example #22
def sequential_feature_selector(features, labels, classifier, k_features, kfold, selection_type, plot=True, **kwargs):
    """Sequential feature selection to reduce the number of features.

    The function reduces a d-dimensional feature space to a k-dimensional
    feature space by sequential feature selection. The features are selected
    using ``mlxtend.feature_selection.SequentialFeatureSelector`` which
    essentially selects or removes a feature from the d-dimensional input space
    until the preferred size is reached.

    The function will pass ``ftype='feature'`` and forward ``features`` on to a
    classifier's ``static_opts`` method.

    Args:
        features: The original d-dimensional feature space
        labels: corresponding labels
        classifier (str or object): The classifier which should be used for
            feature selection. This can be either a string (name of a classifier
            known to gumpy) or an instance of a classifier which adheres
            to the sklearn classifier interface.
        k_features (int): Number of features to select
        kfold (int): k-fold cross validation
        selection_type (str): One of ``SFS`` (Sequential Forward Selection),
            ``SBS`` (Sequential Backward Selection), ``SFFS`` (Sequential Forward
            Floating Selection), ``SBFS`` (Sequential Backward Floating Selection)
        plot (bool): Plot the results of the dimensionality reduction
        **kwargs: Additional keyword arguments that will be passed to the
            Classifier instantiation

    Returns:
        A 5-element tuple containing

        - **feature index**: Index of features in the remaining set
        - **cv_scores**: cross validation scores during classification
        - **algorithm**: Algorithm that was used for search
        - **sfs**: the fitted SequentialFeatureSelector instance
        - **clf**: the underlying classifier that was used

    """

    # retrieve the appropriate classifier
    if isinstance(classifier, str):
        if not (classifier in available_classifiers):
            raise ClassifierError("Unknown classifier {c}".format(c=classifier.__repr__()))

        kwopts = kwargs.pop('opts', dict())
        # opts = dict()

        # retrieve the options that we need to forward to the classifier
        # TODO: should we forward all arguments to sequential_feature_selector ?
        opts = available_classifiers[classifier].static_opts('sequential_feature_selector', features=features)
        opts.update(kwopts)

        # XXX: now merged into the static_opts invocation. TODO: test
        # if classifier == 'SVM':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'RandomForest':
        #     opts['cross_validation'] = kwopts.pop('cross_validation', False)
        # elif classifier == 'MLP':
        #     # TODO: check if the dimensions are correct here
        #     opts['hidden_layer_sizes'] = (features.shape[1], features.shape[2])
        # get all additional entries for the options
        # opts.update(kwopts)

        # retrieve a classifier object
        classifier_obj = available_classifiers[classifier](**opts)

        # extract the backend classifier
        clf = classifier_obj.clf
    else:
        # if we received a classifier object we'll just use this one
        clf = classifier.clf


    if selection_type == 'SFS':
        algorithm = "Sequential Forward Selection (SFS)"
        sfs = SFS(clf, k_features, forward=True, floating=False,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBS':
        algorithm = "Sequential Backward Selection (SBS)"
        sfs = SFS(clf, k_features, forward=False, floating=False,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SFFS':
        algorithm = "Sequential Forward Floating Selection (SFFS)"
        sfs = SFS(clf, k_features, forward=True, floating=True,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    elif selection_type == 'SBFS':
        algorithm = "Sequential Backward Floating Selection (SFFS)"
        sfs = SFS(clf, k_features, forward=True, floating=True,
                verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)

    else:
        raise Exception("Unknown selection type '{}'".format(selection_type))


    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(features, labels)
    subsets = sfs.subsets_
    feature_idx = sfs.k_feature_idx_
    cv_scores = sfs.k_score_

    if plot:
        fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
        plt.ylim([0.5, 1])
        plt.title(algorithm)
        plt.grid()
        plt.show()

    return feature_idx, cv_scores, algorithm, sfs, clf
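# (Sketch) A call to the wrapper above, assuming gumpy registers a classifier under the
# name 'SVM' in available_classifiers and that features/labels are already prepared:
feature_idx, cv_scores, algorithm, sfs_obj, clf = sequential_feature_selector(
    features, labels,
    classifier='SVM',
    k_features=10,
    kfold=5,
    selection_type='SFS',
    plot=True)
print(algorithm, feature_idx, cv_scores)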
Example #23
x_scaled_np = StandardScaler().fit_transform(x_data)
x_scaled_np = PolynomialFeatures(degree=2).fit_transform(x_scaled_np)

print(y)
print(x_scaled_np)

cv = RepeatedKFold(n_splits=5, n_repeats=20)

bins = np.linspace(y.min(), y.max(), 5)
labels = ["1", "2", "3", "4"]
Y_groups = pd.cut(y, bins)

sfs = SFS(regr, floating=True, verbose=2,
          k_features=2, forward=False,
          n_jobs=2,
          scoring='neg_mean_absolute_error', cv=cv)

sfs.fit(x_scaled_np, y)

print("Optimal number of features : %d" % sfs.k_features)
print('Best features :', sfs.k_feature_names_)
print('Best score :', sfs.k_score_)
print(sfs.get_params())
print(sfs)

fig1 = plot_sfs(sfs.get_metric_dict(),
                kind='std_dev',
                figsize=(6, 4))
plt.show()
Example #24
        dic[i] = rfe.score()
    plt.xlabel('feature_num')
    plt.ylabel('score')
    plt.plot(dic.keys(), dic.values())
    plt.show()
    return dic


if __name__ == "__main__":
    train_data = load_data(train_url)
    train_y = train_data['price']
    train_data.drop(['SaleID'], axis=1, inplace=True)
    train_data.drop(['price'], axis=1, inplace=True)
    col_name = [
        'name', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
        'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_3', 'v_12',
        'usedTime'
    ]
    sfs = SFS(LinearRegression(),
              k_features=13,
              forward=True,
              floating=False,
              scoring='r2',
              cv=0)
    train_data = train_data.fillna(0)
    sfs.fit(train_data, train_y)
    print(sfs.k_feature_names_)
    print(pd.DataFrame.from_dict(sfs.get_metric_dict()).T)
    fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
    plt.grid()
    plt.show()
Example #25
selector.k_feature_idx_
selector.k_feature_names_
selector.k_score_


pd.DataFrame.from_dict(selector.get_metric_dict()).T




# Backward Selection
select_back = SequentialFeatureSelector(knn_pipe, k_features=3, forward=False,
                                        floating=False, verbose=2, scoring='accuracy',
                                        cv=5, n_jobs=1)


select_back.fit(X=X, y=y)



# Plot results of Feature Selection (using `mlxtend`)

from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

fig1 = plot_sfs(selector.get_metric_dict(), kind='std_dev')
plt.ylim([0.8, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show();