import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score


def correlation_filter():
    '''
    1.2 Correlation filtering
    We want to keep features that are actually related to the label. Three common
    ways to measure the relevance between a feature and the label: chi-square,
    F-test, mutual information.
    :return:
    '''
    # Load the handwritten-digit recognition dataset
    data = pd.read_csv("./digit recognizor.csv")
    X = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    print(X.shape)

    # Chi-square filtering: compute the chi-square statistic between each
    # non-negative feature and the label, and rank features from high to low.
    # First drop features whose variance is below the median of all feature
    # variances; roughly half of the features remain.
    X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
    print(X_fsvar.shape)

    # Suppose we want 300 features here
    X_fschi = SelectKBest(chi2, k=300).fit_transform(X_fsvar, y)
    print(X_fschi.shape)
    print(cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi, y, cv=5).mean())

    # Learning curve for the hyperparameter k
    # score = []
    # for i in range(390, 200, -10):
    #     X_fschi = SelectKBest(chi2, k=i).fit_transform(X_fsvar, y)
    #     once = cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi, y, cv=5).mean()
    #     score.append(once)
    # plt.plot(range(390, 200, -10), score)
    # plt.show()
    # The curve keeps rising

    # Choose k from the chi-square statistics and p-values instead
    chivalue, pvalues_chi = chi2(X_fsvar, y)
    print(chivalue, pvalues_chi)
    # How large should k be? We want to drop every feature whose p-value exceeds
    # the chosen threshold, e.g. 0.05 or 0.01:
    k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()
    print(k)  # 392

    # F-test (ANOVA): a filter method that captures the linear relationship
    # between each feature and the label.
    F, pvalues_f = f_classif(X_fsvar, y)
    print(F, pvalues_f)
    k = F.shape[0] - (pvalues_f > 0.05).sum()
    print(k)  # 392

    # Mutual information: a filter method that captures any relationship
    # (linear or non-linear) between each feature and the label.
    # It returns an estimate of the mutual information between each feature and
    # the target: 0 means the two variables are independent, larger values mean
    # stronger dependence.
    result = MIC(X_fsvar, y)
    k = result.shape[0] - sum(result <= 0)
    print(k)  # 392
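
def chi2_pipeline_demo():
    # A minimal sketch (not part of the original script; the file path and k=300
    # come from correlation_filter above): chain the variance filter, the
    # chi-square selector, and the forest in a sklearn Pipeline so the selection
    # step is re-fit inside every cross-validation fold instead of on the full data.
    from sklearn.pipeline import Pipeline
    data = pd.read_csv("./digit recognizor.csv")
    X = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    pipe = Pipeline([
        ("var", VarianceThreshold(np.median(X.var().values))),  # drop the low-variance half
        ("chi", SelectKBest(chi2, k=300)),                       # keep the 300 best by chi-square
        ("rfc", RFC(n_estimators=10, random_state=0)),
    ])
    print(cross_val_score(pipe, X, y, cv=5).mean())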
def aveMI(X, Y):
    # Average estimated mutual information between the features in X and the label Y
    MI = MIC(X, Y)
    return np.nanmean(MI)
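
def aveMI_demo():
    # A small self-contained check of aveMI (illustrative only, using sklearn's
    # built-in digits dataset rather than the CSV above): the average mutual
    # information between pixels and the digit label should be clearly above zero.
    from sklearn.datasets import load_digits
    digits = load_digits()
    print(aveMI(digits.data, digits.target))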
# F-test:
from sklearn.feature_selection import f_classif

F, pvalues_f = f_classif(X_fsvar, y)
k = F.shape[0] - (pvalues_f > 0.05).sum()
X_fsF = SelectKBest(f_classif, k=392).fit_transform(X_fsvar, y)
cross_val_score(RFC(n_estimators=10, random_state=0), X_fsF, y, cv=5).mean()

# In[]:
# Mutual information:
# '''
# Computationally expensive
from sklearn.feature_selection import mutual_info_classif as MIC

result = MIC(X_fsvar, y)
k = result.shape[0] - sum(result <= 0)  # 392
# In[]:
X_fsmic = SelectKBest(MIC, k=392).fit_transform(X_fsvar, y)
cross_val_score(RFC(n_estimators=10, random_state=0), X_fsmic, y, cv=5).mean()
# '''

# In[]:
# Embedded method:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
import matplotlib.pyplot as plt

# Instantiate the random forest
RFC_ = RFC(n_estimators=10, random_state=0)
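
# In[]:
# A minimal sketch of the embedded step that follows (threshold=0.005 is an
# illustrative value, not from the original script): SelectFromModel keeps only
# the features whose random-forest importance exceeds the threshold, then the
# reduced matrix is scored with cross-validation.
X_embedded = SelectFromModel(RFC_, threshold=0.005).fit_transform(X_fsvar, y)
print(X_embedded.shape)
print(cross_val_score(RFC_, X_embedded, y, cv=5).mean())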
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Read the dataset straight from the zip archive
file = ZipFile('./digit recognizor.zip')
f = file.open('digit recognizor.csv')
df = pd.read_csv(f)
f.close()
file.close()
df.info()

x = df.iloc[:, 1:]
y = df.iloc[:, 0]

selector = VarianceThreshold(np.median(x.var().values))  # filter out half of the features first
result = selector.fit_transform(x)
print(df.shape, result.shape)

# Mutual information: 0 means the feature is independent of the label, larger
# values mean stronger dependence. A feature that is independent of the label
# carries no useful information, so we drop features whose estimated MI is <= 0.
tmp = MIC(result, y)
k = tmp.shape[0] - sum(tmp <= 0)
result2 = SelectKBest(MIC, k=k).fit_transform(result, y)  # filter by mutual information score
print(result2.shape)

# Plot accuracy against the number of selected features
score = []
r = range(350, 250, -10)
for i in r:
    result2 = SelectKBest(MIC, k=i).fit_transform(result, y)
    score.append(
        cross_val_score(RandomForestClassifier(n_estimators=10, random_state=0), result2,
                        y, cv=5).mean())
plt.plot(r, score)
plt.show()
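
# A small follow-up sketch (not in the original script): read the best k off the
# learning curve above and rebuild the reduced feature matrix with it.
best_k = list(r)[int(np.argmax(score))]
print("best k:", best_k, "score:", max(score))
result_best = SelectKBest(MIC, k=best_k).fit_transform(result, y)
print(result_best.shape)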
y_test = test.iloc[:, 1]
print(X_test.shape)

# Standardize the training features
scaler = preprocessing.StandardScaler().fit(X)
X_data_transformed = scaler.transform(X)
X_data_transformed = pd.DataFrame(X_data_transformed)
X_data_transformed.columns = X.columns
X_data = X_data_transformed

# Standardize the test features with the scaler fitted on the training data,
# so both sets share the same transformation
X_test_transformed = scaler.transform(X_test)
X_test_transformed = pd.DataFrame(X_test_transformed)
X_test_transformed.columns = X_test.columns
X_test = X_test_transformed

# ################ MIC ##########
result = MIC(X_data, y_data, random_state=100)
k = result.shape[0] - sum(result <= 0)
Select = SelectKBest(MIC, k=k)
Select.fit(X_data, y_data)
X_new = Select.transform(X_data)
# X_new = SelectKBest(chi2, k=131).fit_transform(X_data, y_data)
# X_data = X_new
print(X_new.shape)

# Keep the same selected columns in both the training and test matrices
X = X_data.T
X_data = X[Select.get_support()].T
print(X_data.shape)
X1 = X_test.T
X_test = X1[Select.get_support()].T
print(X_test.shape)
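
# An equivalent, arguably clearer alternative to the transpose trick above
# (assuming X_data / X_test are still DataFrames): index the columns directly
# with the boolean support mask returned by the selector.
# mask = Select.get_support()
# X_data = X_data.loc[:, mask]
# X_test = X_test.loc[:, mask]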
# pvalue < 0.05(或者0.01) , 拒绝原假设(特征X与标签Y独立) , 接受备用假设(X与Y相关) # independentColumnsNo_of_chi = (pValue > 0.05).sum() # k_best_chi = len(chiValue) - independentColumnsNo_of_chi # print(f'No. of ignored columns = {independentColumnsNo_of_chi}') # print(f'Best k_best = {k_best_chi}') # To determine best k in chi2() , we draw the learning curve # score_list = [] # k_range = range(390, 150, -10) # for i in k_range: # X_chi2_plot = SelectKBest(chi2, k=i).fit_transform(X_var_2, Y) # once = cross_val_score(RFC(n_estimators=10, random_state=0), X_chi2_plot, Y.flatten(), cv=5).mean() # score_list.append(once) # plt.plot(k_range, score_list) # plt.show() # F检验 # F , p_value = f_classif(X_var_2,Y.flatten()) # independentColumnsNo_of_F_classif = (p_value>0.05).sum() # k_best_F = len(F) - independentColumnsNo_of_F_classif # print(f'independentColumnsNo_of_F_classif = {independentColumnsNo_of_F_classif}') # print(f'k_best_F = {k_best_F}') # 互信息法 result = MIC(X_var_2, Y.flatten()) colno = sum(result <= 0) k_best_mutual = len(result) - colno print(f'colno={colno}') print(f'k_best_mutual={k_best_mutual}')
import numpy as np
from sklearn.model_selection import KFold
# Assumed bindings (not shown in the original snippet): relevance uses MI with the
# discrete label, redundancy uses MI between continuous features.
from sklearn.feature_selection import mutual_info_classif as MID
from sklearn.feature_selection import mutual_info_regression as MIC


def mRmR(X, Y, clf, n):
    """
    Feature Subset Selection Via Ensemble Method 'Max-Relevance, Min-Redundancy'.
    Works only for continuous features and categorical labels.

    Params:
        X   -> A np.array (2D) representing the feature matrix. Each column is a
               feature, each row a sample.
        Y   -> A np.array (1D) representing the pattern class.
        n   -> Maximum number of features to select.
        clf -> Selected classifier used as the wrapper.
    """
    candidate_feature_indices = np.arange(X.shape[-1])
    feature_sets = []

    # Phase 1: Create sequential feature sets [S1, S2, S3, ..., Sn]
    for i in range(n):
        print('Computing Feature Set S%s' % (i + 1))
        relevance = MID(X[:, candidate_feature_indices], Y)
        redundancy = np.zeros(len(relevance))
        if feature_sets:
            # Mean mutual information between each candidate and the features
            # already selected in S(i-1)
            for k in feature_sets[i - 1]:
                redundancy += MIC(X[:, candidate_feature_indices], X[:, k])
            redundancy /= len(feature_sets[i - 1])
        score = relevance - redundancy
        best_feature_index = np.argmax(score)
        if feature_sets:
            feature_sets.append(
                feature_sets[-1] + [candidate_feature_indices[best_feature_index]])
        else:
            feature_sets.append([candidate_feature_indices[best_feature_index]])
        candidate_feature_indices = np.delete(candidate_feature_indices, best_feature_index)

    # Phase 2: Validate feature set performance with 5-fold cross-validation
    feature_set_scores = []
    for feature_set in feature_sets:
        kf = KFold(n_splits=5)
        avg_accuracy = 0
        for train_index, test_index in kf.split(X, Y):
            clf.fit(X[train_index][:, feature_set], Y[train_index])
            avg_accuracy += clf.score(X[test_index][:, feature_set], Y[test_index])
        feature_set_scores.append(avg_accuracy / 5)

    # Phase 3: Find the best possible subspace of the best feature set Sk by
    # greedily removing one feature at a time while the score keeps improving
    best_feature_subset = feature_sets[np.argmax(feature_set_scores)]
    best_subset_score = np.max(feature_set_scores)
    found_better_subset = True
    while found_better_subset and len(best_feature_subset) > 1:
        feature_subsets = [
            best_feature_subset[:k] + best_feature_subset[k + 1:]
            for k in range(len(best_feature_subset))
        ]
        feature_subset_scores = []
        for feature_set in feature_subsets:
            kf = KFold(n_splits=5)
            avg_accuracy = 0
            for train_index, test_index in kf.split(X, Y):
                clf.fit(X[train_index][:, feature_set], Y[train_index])
                avg_accuracy += clf.score(X[test_index][:, feature_set], Y[test_index])
            feature_subset_scores.append(avg_accuracy / 5)
        if np.max(feature_subset_scores) > best_subset_score:
            best_subset_score = np.max(feature_subset_scores)
            best_feature_subset = feature_subsets[np.argmax(feature_subset_scores)]
        else:
            found_better_subset = False

    return best_feature_subset
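
def mRmR_demo():
    # A hypothetical usage sketch of mRmR (dataset and parameters are illustrative):
    # select up to 10 features from sklearn's wine dataset, using a small random
    # forest as the wrapper classifier.
    from sklearn.datasets import load_wine
    from sklearn.ensemble import RandomForestClassifier

    wine = load_wine()
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    best_subset = mRmR(wine.data, wine.target, clf, n=10)
    print("Selected feature indices:", best_subset)
    print("Selected feature names:", [wine.feature_names[i] for i in best_subset])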