def get_cascaded_sel_idx(high_th_year, low_th_year, feature_list, set_feature,
                         sel_feature_num, div_ratio=4):
    """Select feature indices by Fisher score on a high/low risk training set.

    The year thresholds are converted to days (x365) and handed to
    ``helper.get_risk_group`` together with ``x``, ``c`` and ``s``, which are
    read from the enclosing scope (as is ``self.random_seed``).

    Args:
        high_th_year: High-risk threshold, in years.
        low_th_year: Low-risk threshold, in years.
        feature_list: Unused by this function.
        set_feature: Column indices to restrict the training matrix to;
            an empty sequence keeps every column.
        sel_feature_num: Number of indices to return; ``0`` means
            "number of columns / div_ratio".
        div_ratio: Divisor used when ``sel_feature_num`` is ``0``.

    Returns:
        The first ``sel_gene_num`` entries of the ``mode='index'`` output of
        ``fisher_score.fisher_score``.
    """
    groups = helper.get_risk_group(x, c, s,
                                   high_th_year * 365, low_th_year * 365)
    high_risk_group, low_risk_group = groups

    # Training data only; no validation split is made here.
    trn_x, trn_y = helper.get_train(high_risk_group,
                                    low_risk_group,
                                    is_categori_y=False,
                                    seed=self.random_seed)

    if len(set_feature) != 0:
        trn_x = trn_x[:, set_feature]

    feature_num = trn_x.shape[1]
    if sel_feature_num != 0:
        sel_gene_num = sel_feature_num
    else:
        sel_gene_num = int(max(sel_feature_num, feature_num / div_ratio))

    ranked = fisher_score.fisher_score(trn_x, trn_y, mode='index')
    return ranked[:sel_gene_num]
def fisherProc(X, y):
    """Return the Fisher-score feature ranking for the training set (X, y).

    The per-feature scores are computed first and then converted to a
    ranking (descending by score) via ``fisher_score.feature_ranking``.
    """
    return fisher_score.feature_ranking(fisher_score.fisher_score(X, y))
def fisher_score_ranking(num_features=None):
    """Compute, save and plot the Fisher-score ranking of all features.

    Reads the feature, target and clinical CSV files from fixed relative
    paths, standardises the features, computes Fisher scores, stores them as
    a ``.npy`` file and plots the top-ranked features.

    NOTE: For BigArt. These features are passed on to LGBM in best model.

    Args:
        num_features: Unused by this function.
    """
    features = pd.read_csv(
        '../../data_source/to_analysis/compressed_features/all_features_orig_images_icc_dropped.csv',
        index_col=0)
    target = pd.read_csv('../../data_source/to_analysis/target_dfs.csv',
                         index_col=0)
    target = np.squeeze(target.values)
    clinical = pd.read_csv('../../data_source/to_analysis/clinical_params.csv',
                           index_col=0)

    standardised = StandardScaler().fit_transform(features)
    scores = fisher_score(standardised, target)
    np.save(
        './../../data_source/results/feature_importance/fisher_feat_ranks.npy',
        scores)

    ranks = extract_ranks(scores, features.columns, clinical)
    ranks.sort_values('ranks', ascending=False, inplace=True)

    # Figure size for the top-10 plot; (15, 3.5) was used for five variables.
    plt.figure(figsize=(15, 7))
    plt.xlabel('Fisher Score')
    plot_feature_ranking(
        ranks,
        n=10,
        show=False,
        path_to_figure='../feature_importances/fisher_score_ranking.pdf')
def fisher_feature_reduction(self, down, up):
    """Score the attributes of rows ``down:up`` of ``self.class_train``.

    Every column but the last is treated as a feature and the last column as
    the label. The scores are also forwarded to ``self.to_dict`` (which,
    per the original comment, accumulates importance per attribute).

    Returns:
        The Fisher score array for the selected rows.
    """
    features = self.class_train.values[down:up, :-1]
    labels = self.class_train.iloc[down:up, -1]
    score = fisher_score.fisher_score(features, labels)
    self.to_dict(score)
    return score
def fisher(data):
    """Aggregate Fisher-score rankings over the first six entries of ``data``.

    Each ``data[i]`` is a 2-D array whose last column is the label. Its
    feature ranking is converted to a list, passed through ``samp`` and
    collected; the six results are combined with ``rankaggregate``.
    """

    def _ranking_for(block):
        # Last column is the label, everything before it is a feature.
        score = fisher_score.fisher_score(block[:, :-1], block[:, -1])
        return samp(fisher_score.feature_ranking(score).tolist())

    return rankaggregate([_ranking_for(data[i]) for i in range(6)])
def fisher():
    """Time a Fisher-score selection on the module-level ``data``/``labels``.

    Prints the number of kept indices and the elapsed time, and saves the
    selection via ``transform_and_save`` when it is smaller than ``header``.
    Relies on the module-level names ``data``, ``labels``, ``treshold`` and
    ``header``.
    """
    started = datetime.datetime.now()
    ranked = fisher_score.fisher_score(data, labels, mode="index")
    elapsed = datetime.datetime.now() - started

    print("Fisher")
    selected = ranked[:treshold]
    print(len(selected))
    print("cas: " + str(elapsed))
    print('\n')

    # Only persist when the selection actually dropped something.
    if len(selected) < len(header):
        transform_and_save(selected, "Fisher")
def seleciona_caracteristicas(vetor_caracteristicas, classes):
    """Select the features whose Fisher score passes the threshold.

    Args:
        vetor_caracteristicas: 2-D feature matrix (samples x features).
        classes: Class label for each sample.

    Returns:
        A tuple ``(caracteristicas_selecionadas, rank_considerado)``: the
        matrix restricted to the selected columns and the selected column
        indices in rank order. Both are empty lists when no more than one
        feature is counted by ``conta_features_limiar``.
    """
    limiar_consideracao = 0
    caracteristicas_selecionadas = []
    # Initialised up front: previously this name was only bound inside the
    # ``if`` below, so the ``return`` raised UnboundLocalError otherwise.
    rank_considerado = []

    score = fisher_score.fisher_score(vetor_caracteristicas, classes)
    rank = fisher_score.feature_ranking(score)
    features_consideradas = conta_features_limiar(score, limiar_consideracao)

    # NOTE(review): a single qualifying feature is discarded by ``> 1`` --
    # confirm that this is intended rather than ``>= 1``.
    if features_consideradas > 1:
        rank_considerado = rank[:features_consideradas]
        caracteristicas_selecionadas = vetor_caracteristicas[:, rank_considerado]

    return caracteristicas_selecionadas, rank_considerado
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the first ``sel_feature_num`` Fisher-selected feature indices.

    The year thresholds are converted to days (x365) to split samples into
    high- and low-risk groups. ``x``, ``c``, ``s`` and ``self.random_seed``
    are read from the enclosing scope. ``feature_list`` is unused.
    """
    groups = helper.get_risk_group(x, c, s,
                                   high_th_year * 365, low_th_year * 365)
    high_risk_group, low_risk_group = groups

    # Training data only; no validation split is made here.
    trn_x, trn_y = helper.get_train(high_risk_group,
                                    low_risk_group,
                                    is_categori_y=False,
                                    seed=self.random_seed)

    ranked = fisher_score.fisher_score(trn_x, trn_y, mode='index')
    return ranked[:sel_feature_num]
def get_fisher_scores(self, max_dim):
    """Get cumulative Fisher scores and ROC AUC based recognition quality.

    Features are ranked by Fisher score on the scaled ``self.features``.
    For every feature-space dimensionality from 1 to ``max_dim`` the top
    ranked features are evaluated with 5-fold cross-validation, and the mean
    ROC AUC over the folds is recorded.

    Args:
        max_dim(int): number of features up to which selection is performed.

    Returns:
        fisher_summary_scores: running (cumulative) sums of the Fisher
            scores of the ``max_dim`` best-ranked features.
        auc_roc_scores: mean cross-validated ROC AUC for each dimensionality.
    """
    x_train = scale(self.features)  # normalised features
    y_train = self.targets  # target ids

    f_score = fisher_score.fisher_score(x_train, y_train)
    ranking = fisher_score.feature_ranking(f_score)
    best = ranking[0:max_dim]

    print('Последовательность отобранных коэффициентов:')
    print(*list(self.feature_header[best]), sep=', ')

    fisher_summary_scores = list(it.accumulate(f_score[best]))

    # Cross-validation setup: shuffled 5-fold, ROC AUC scorer, SGD regressor
    # used as the classifier.
    k_fold = KFold(n_splits=5, shuffle=True)
    ar_scorer = make_scorer(roc_auc_score)
    clf = SGDRegressor(max_iter=100, tol=1e-3, random_state=241)

    auc_roc_scores = [
        np.mean(
            cross_val_score(clf,
                            x_train[:, ranking[0:n_selected]],
                            y_train,
                            scoring=ar_scorer,
                            cv=k_fold))
        for n_selected in range(1, max_dim + 1)
    ]

    return fisher_summary_scores, auc_roc_scores
def run_fold(trial, P, X, y, method, dataset, parttype):
    """Run one feature-selection method on the training part of one fold.

    Args:
        trial: Column of ``P`` describing this fold.
        P: Partition matrix; rows with ``P[:, trial] == 1`` form the
            training set.
        X: Feature matrix (samples x features).
        y: Labels, indexable by the same row mask as ``X``.
        method: One of 'fisher', 'chi2', 'relieff', 'jmi', 'mrmr',
            'infogain', 'svmrfe', 'hdmr', 'hdmrhaar'.
        dataset: Dataset name, used only in the progress message.
        parttype: Partition type, used only in the progress message.

    Returns:
        dict with ``'features'`` (the method's feature ordering) and
        ``'cputime'`` (wall-clock seconds spent selecting).

    Raises:
        ValueError: If ``method`` is not recognised. Previously this case
            printed a message and then failed on the unbound ``features``.
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print('Obtaining features for %s %s %s fold: %2d' %
          (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method in ('hdmr', 'hdmrhaar'):
        # The two HDMR variants differ only in the 'p' and 'b' parameters.
        p, b = (3, 'L') if method == 'hdmr' else (255, 'H')
        sobol_set = scipy.io.loadmat('sobol_set.mat')['sobol_set']
        sobol_set = sobol_set.astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': p, 'M': 1000, 'b': b}
        models = hdmrlearn(trnX, trnY, params)
        # NOTE(review): selection receives the full X, not trnX, exactly as
        # in the original code -- confirm this is intended.
        features, _ = hdmrselect(X, models)
    else:
        raise ValueError("feature selection method '%s' does not exist" %
                         method)
    cputime = time.time() - start_time

    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
def get_fisher_score(data, label, k=30):
    """Return the ``k`` best features by Fisher score plus the full ranking.

    Args:
        data: Feature table exposing ``.columns`` (used for feature names).
        label: Class labels.
        k: Number of top-scoring features to keep in the result table.

    Returns:
        A tuple ``(result, ranking)`` where ``result`` is a DataFrame with
        columns ``'Feature'`` and ``'Score'`` holding the ``k`` largest
        scores, and ``ranking`` is the output of
        ``fisher_score.feature_ranking``.
    """
    score = fisher_score.fisher_score(data, label)
    ranking = fisher_score.feature_ranking(score)

    # Pair every column name with its score, side by side.
    featureScores = pd.concat(
        [pd.DataFrame(data.columns), pd.DataFrame(score)], axis=1)
    featureScores.columns = ['Feature', 'Score']

    return featureScores.nlargest(k, 'Score'), ranking
def weight():
    """Compute ReliefF, Fisher and (negated) Gini feature weights.

    Reads the module-level ``comtest`` table: columns 1 .. second-to-last
    are the case data and the last column is the label. Labels equal to 0
    are remapped to -1 before scoring.

    Returns:
        A tuple ``(Relief, Fisher, gini)`` of per-feature weights, with the
        Gini index negated.
    """
    # Extract the case data and its labels. The builtin ``float``/``int``
    # replace the ``np.float``/``np.int`` aliases removed in NumPy 1.24.
    datamat = np.array(comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1],
                       dtype=float)
    labelmat = np.array(comtest.iloc[0:len(comtest), -1], dtype=int)
    datamat = preprocess(datamat)

    # AdaBoost can only distinguish the labels -1 and 1.
    labelmat[labelmat == 0] = -1

    # Feature weights under ReliefF.
    Relief = reliefF.reliefF(datamat, labelmat)
    print('Relief, 第%s次验证 ' % (1))

    # Feature weights under the Fisher score.
    Fisher = fisher_score.fisher_score(datamat, labelmat)
    print('Fisher, 第%s次验证 ' % (1))

    # Feature weights under the Gini index, negated.
    gini = -gini_index.gini_index(datamat, labelmat)
    print('gini, 第%s次验证 ' % (1))

    # NOTE(review): the placeholder below is never formatted, so the literal
    # text "done_ %s" is printed; left untouched pending confirmation.
    print("done_ %s")
    return Relief, Fisher, gini
def naiveBayes(processed_train_features, processed_valid_features,
               train_labels, valid_labels, processed_test_features,
               test_labels):
    """Evaluate Gaussian naive Bayes before and after Fisher selection.

    First fits a model on the training set and prints its validation and
    test accuracy. Then ranks features by Fisher score, keeps the top 50,
    fits a second model on the reduced *test* set and prints its accuracy on
    the reduced test and validation sets.

    Args:
        processed_train_features: Training features (DataFrame).
        processed_valid_features: Validation features (DataFrame).
        train_labels: Training labels.
        valid_labels: Validation labels.
        processed_test_features: Test features (DataFrame).
        test_labels: Test labels.
    """
    model1 = GaussianNB()
    model1.fit(processed_train_features, train_labels)

    naive_bayes_predict_valid = model1.predict(processed_valid_features)
    print("Naive Bayes Valid accuracy ",
          accuracy_score(valid_labels, naive_bayes_predict_valid))
    naive_bayes_predict_test_before_fisher = model1.predict(
        processed_test_features)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_test_before_fisher))

    # NOTE(review): the ranking below is computed from the test set and its
    # labels, and the second model is both fitted and scored on that same
    # set -- kept as in the original, but confirm this is intended.
    XFisher = processed_test_features.to_numpy()
    score = fs.fisher_score(XFisher, test_labels)
    ranked_featrues = fs.feature_ranking(score)
    topFeatures = ranked_featrues[:50]
    print(topFeatures)
    print(score.shape)
    print(XFisher.shape)

    # Column names are looked up on the training frame, which assumes the
    # train and test frames share the same column order.
    colnamelist = [processed_train_features.columns[i] for i in topFeatures]

    # ``columns=`` replaces the positional ``axis`` argument (``drop(labels,
    # 1, ...)``), which pandas 2.0 no longer accepts.
    test = processed_test_features.drop(
        columns=processed_test_features.columns.difference(colnamelist))
    valid_for_bayes = processed_valid_features.drop(
        columns=processed_valid_features.columns.difference(colnamelist))

    model = GaussianNB()
    model.fit(test, test_labels)
    naive_bayes_predict_test_after_fisher = model.predict(test)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_test_after_fisher))
    naive_bayes_predict_valid_after_fisher = model.predict(valid_for_bayes)
    print("Naive Bayes Validation accuracy",
          accuracy_score(valid_labels, naive_bayes_predict_valid_after_fisher))
def main():
    """Evaluate Fisher-score feature selection on COIL20 with a linear SVM.

    Runs shuffled 10-fold cross-validation: in each fold the features are
    ranked on the training samples only, the 100 best are kept, a linear SVM
    is trained on them and its test accuracy is accumulated. The mean
    accuracy over the folds is printed.
    """
    # Load data and labels.
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"].astype(float)
    y = mat["Y"][:, 0]
    n_samples, n_features = X.shape

    folds = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()

    correct = 0
    for train, test in folds:
        # Rank features (descending by score) on the training samples only.
        ranking = fisher_score.feature_ranking(
            fisher_score.fisher_score(X[train], y[train]))
        selected_features = X[:, ranking[0:num_fea]]

        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])
        correct = correct + accuracy_score(y[test], y_predict)

    # Average classification accuracy over all 10 folds. On Python 2 this
    # prints exactly what ``print "Accuracy:", value`` printed.
    print("Accuracy: %s" % (float(correct) / 10))
def main():
    """Evaluate Fisher-score feature selection on COIL20 with a linear SVM.

    Uses shuffled 10-fold cross-validation. Per fold, features are ranked on
    the training samples, the top 100 are kept, and a linear SVM trained on
    them is scored on the held-out samples. Prints the mean accuracy.
    """
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']
    X = X.astype(float)
    y = mat['Y'][:, 0]
    n_samples, n_features = X.shape

    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()

    fold_accuracies = []
    for train, test in ss:
        # Scores come from the training part only, then are ranked in
        # descending order.
        score = fisher_score.fisher_score(X[train], y[train])
        top = fisher_score.feature_ranking(score)[0:num_fea]
        selected_features = X[:, top]

        clf.fit(selected_features[train], y[train])
        fold_accuracies.append(
            accuracy_score(y[test], clf.predict(selected_features[test])))

    # Average classification accuracy over all 10 folds.
    correct = sum(fold_accuracies)
    print('Accuracy:', old_div(float(correct), 10))
def rank_features_using_fisherscore(cls,
                                    data_frame,
                                    target_key,
                                    cols_to_ignore=None):
    """Rank the columns of ``data_frame`` by Fisher score against a target.

    Args:
        data_frame: Table holding the feature columns and the target column.
        target_key: Name of the target column.
        cols_to_ignore: Optional iterable of column names to leave out.

    Returns:
        A tuple ``(score, ranked_features, keys)``: the raw scores, the
        feature names ordered by rank, and the feature names in their
        original column order (``score`` is aligned with ``keys``).

    Raises:
        ValueError: If ``target_key`` or an ignored column is not a column
            of ``data_frame``.
    """
    keys = list(data_frame.keys())

    # Drop the target and any ignored columns from the feature names.
    keys.remove(target_key)
    if cols_to_ignore is not None:
        for col in cols_to_ignore:
            keys.remove(col)

    Y = data_frame.loc[:, target_key].values
    # The earlier ``X = data_frame.values`` was a dead store (immediately
    # overwritten) and has been removed.
    X = data_frame.loc[:, keys]

    score = fisher_score.fisher_score(X, Y)
    rank = fisher_score.feature_ranking(score)
    ranked_features = [keys[i] for i in rank]
    return score, ranked_features, keys
def fischer_score_featureSelection(x, y):
    """Return the rank derived from the Fisher scores of (x, y).

    The scores are converted with the module's ``score_to_rank`` helper.
    """
    return score_to_rank(fisher_score.fisher_score(x, y))
def fisher_score_FS(X_train, y_train):
    """Return ``(idx, score)``: the Fisher feature ranking and raw scores."""
    fisher_values = fisher_score.fisher_score(X_train, y_train)
    ranking = fisher_score.feature_ranking(fisher_values)
    return (ranking, fisher_values)
scores4 = np.append( scores4, abs(pearsonscore[0]) ) #absolute value because -1 or +1 represent perfect correlation g1 = lambda e: e[1] g10 = lambda e: e[1][0] R4, _ = zip(*sorted(enumerate(sorted(enumerate(-scores4), key=g1)), key=g10)) #print scores4 formatted_scores4 = ['%.2f' % elem for elem in scores4] print formatted_scores4 print R4 # ------------------------ Fisher Score ------------------------ print "Fisher Score:" scores5 = fisher_score.fisher_score(X, y) g1 = lambda e: e[1] g10 = lambda e: e[1][0] R5, _ = zip(*sorted(enumerate(sorted(enumerate(-scores5), key=g1)), key=g10)) #print scores5 formatted_scores5 = ['%.2f' % elem for elem in scores5] print formatted_scores5 print R5 # ------------------------ Relief-F ------------------------ print "Relief-F:" scores6 = reliefF.reliefF(X, y) g1 = lambda e: e[1]
def fit(self, X, y):
    """Run the configured feature selector and return the feature indices.

    The selector family is chosen by ``self.tp`` ('ITB', 'filter',
    'wrapper' or 'SLB') and the concrete method by ``self.name``;
    method-specific settings are read from ``self.params``.

    Returns:
        The selected/ranked feature indices, or an empty list when no
        branch matches ``self.tp`` / ``self.name``.
    """
    idx = []

    if self.tp == 'ITB':

        if self.name == 'MRMR':
            idx = MRMR.mrmr(X, y, n_selected_features=self.params['num_feats'])

    elif self.tp == 'filter':

        if self.name == 'Relief':
            score = reliefF.reliefF(X, y, k=self.params['k'])
            idx = reliefF.feature_ranking(score)

        if self.name == 'Fisher':
            # obtain the score of each feature on the training set
            score = fisher_score.fisher_score(X, y)

            # rank features in descending order according to score
            idx = fisher_score.feature_ranking(score)

        if self.name == 'MI':
            # argsort is ascending, so reverse to put the highest mutual
            # information first
            idx = np.argsort(
                mutual_info_classif(
                    X, y, n_neighbors=self.params['n_neighbors']))[::-1]

    elif self.tp == 'wrapper':

        # fit the wrapped model, then keep the features SelectFromModel
        # retains for it
        model_fit = self.model.fit(X, y)
        model = SelectFromModel(model_fit, prefit=True)
        idx = model.get_support(indices=True)

    elif self.tp == 'SLB':

        # one-hot-encode on target
        y = construct_label_matrix(y)

        if self.name == 'SMBA':
            scba = fs.SCBA(data=X,
                           alpha=self.params['alpha'],
                           norm_type=self.params['norm_type'],
                           verbose=self.params['verbose'],
                           thr=self.params['thr'],
                           max_iter=self.params['max_iter'],
                           affine=self.params['affine'],
                           normalize=self.params['normalize'],
                           step=self.params['step'],
                           PCA=self.params['PCA'],
                           GPU=self.params['GPU'],
                           device=self.params['device'])

            nrmInd, sInd, repInd, _ = scba.admm()
            # pick which of the returned index sets to expose; anything
            # other than 'nrmInd' / 'repInd' falls back to sInd
            if self.params['type_indices'] == 'nrmInd':
                idx = nrmInd
            elif self.params['type_indices'] == 'repInd':
                idx = repInd
            else:
                idx = sInd

        if self.name == 'RFS':
            W = RFS.rfs(X, y, gamma=self.params['gamma'])
            idx = feature_ranking(W)

        if self.name == 'll_l21':
            # obtain the feature weight matrix
            W, _, _ = ll_l21.proximal_gradient_descent(X,
                                                       y,
                                                       z=self.params['z'],
                                                       verbose=False)
            # rank the features from the weight matrix (the ordering is
            # defined by feature_ranking, which is not shown here)
            idx = feature_ranking(W)

        if self.name == 'ls_l21':
            # obtain the feature weight matrix
            W, _, _ = ls_l21.proximal_gradient_descent(X,
                                                       y,
                                                       z=self.params['z'],
                                                       verbose=False)
            # rank the features from the weight matrix (the ordering is
            # defined by feature_ranking, which is not shown here)
            idx = feature_ranking(W)

        if self.name == 'LASSO':
            LASSO = Lasso(alpha=self.params['alpha'], positive=True)
            y_pred_lasso = LASSO.fit(X, y)
            # with a 2-D coefficient matrix only the first row is used
            if y_pred_lasso.coef_.ndim == 1:
                coeff = y_pred_lasso.coef_
            else:
                coeff = np.asarray(y_pred_lasso.coef_[0, :])
            # negate so that the largest coefficients come first
            idx = np.argsort(-coeff)

        if self.name == 'EN':  # elastic net L1
            enet = ElasticNet(alpha=self.params['alpha'],
                              l1_ratio=1,
                              positive=True)
            y_pred_enet = enet.fit(X, y)
            # with a 2-D coefficient matrix only the first row is used
            if y_pred_enet.coef_.ndim == 1:
                coeff = y_pred_enet.coef_
            else:
                coeff = np.asarray(y_pred_enet.coef_[0, :])
            # negate so that the largest coefficients come first
            idx = np.argsort(-coeff)

    return idx
def fisher_score_selection(X, y):
    """Return feature indices ordered from highest to lowest Fisher score.

    Wraps the skfeature Fisher score: the scores are arg-sorted along axis 0
    (ascending) and the result is reversed.
    """
    ascending = np.argsort(fisher_score(X, y), 0)
    return ascending[::-1]
tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.') plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC in Cervical Cancer Data Before FS') plt.legend(loc="lower right") plt.show() #feature selection for train, test in cv.split(X, y): score = fisher_score.fisher_score(X[train], y[train]) # probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test]) # Compute ROC curve and area the curve print(len(score)) idx = fisher_score.feature_ranking(score) #print(idx) num_fea = 6 #Have to explain why the machine pick up those and do the classification again #X1 = ad[['NEK6','SLC2A4','SLC2A5','SUV_C34', 'SUVreduction']] #data.iloc[[0,3,6,24], [0,5,6]] X = pd.DataFrame(X) X1 = X.iloc[:, [ idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8], idx[9], idx[10], idx[11]
mean_pos=np.mean(positive_feaure,axis=0)#正类中,各特征的平均值 mean_neg=np.mean(negtive_feature,axis=0)#负类中,各样本的平均值 std_pos=np.std(positive_feaure,ddof=1,axis=0)#正类中各特征值的标准差 std_neg=np.std(negtive_feature,ddof=1,axis=0)#负类中各特征值的标准差 F_up=np.square(mean_pos-mean_feature)+np.square(mean_neg-mean_feature) F_down=np.square(std_pos)+np.square(std_neg) F_score=F_up/F_down """ #------------calculate the FS score with scikit-feature package--------------# from skfeature.function.similarity_based import fisher_score from skfeature.function.information_theoretical_based import MRMR from skfeature.function.similarity_based import reliefF from skfeature.function.statistical_based import gini_index Relief = reliefF.reliefF(datamat, labelmat) Fisher= fisher_score.fisher_score(datamat, labelmat) # mRMR,J,M,=MRMR.mrmr(datamat,labelmat,n_selected_features=80) # mRMR=-mRMR gini= gini_index.gini_index(datamat,labelmat) gini=-gini FSscore=np.column_stack((Relief,Fisher,gini))#合并三个分数 FSscore=ann.preprocess(FSscore) FinalScore=np.sum(FSscore,axis=1) FS=np.column_stack((FSscore,FinalScore)) FS_nor=ann.preprocess(FS)#将最后一列联合得分归一化 FS=pd.DataFrame(FS_nor,columns=["Relief", "Fisher","gini","FinalScore"],index=featurenames) # FS.to_csv("F:\Githubcode\AdaBoost\myown\FSscore.csv") sorteigen=FS.sort_values(by='FinalScore',ascending=False,axis=0)
"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1 } W = construct_W.construct_W(X_train, **kwargs_W) score = lap_score.lap_score(X_train, W=W) idx = lap_score.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # fisher_score score = fisher_score.fisher_score(X_train, y_train) idx = fisher_score.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # reliefF score = reliefF.reliefF(X_train, y_train) idx = reliefF.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # chi_square
probas_ = clf.predict_proba(X[test]) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) tprs.append(interp(mean_fpr, fpr, tpr)) tprs[-1][0] = 0.0 roc_auc = auc(fpr, tpr) aucs.append(roc_auc) acc = clf.score(X[test], y[test]) accs.append(acc) i += 1 print(sum(aucs) / float(len(aucs))) print(sum(accs) / float(len(accs))) #Fisher score score = fisher_score.fisher_score(X, y) #print(len(score)) idx = fisher_score.feature_ranking(score) #print(idx) num_fea = 6 X_resampled = pd.DataFrame(X_resampled) X1 = X_resampled.iloc[:, [ idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8], idx[9], idx[10], idx[11] ]] #X1 = X.iloc[:, [idx[0], idx[1], idx[2], idx[3], idx[4]]] X1 = pd.DataFrame(X1) #print("Selected features {}".format(X1.columns.values)) X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,
def fun_classify(inputFile, groupsSel, FeatSelect, Nfeats, scaleFeats=1):
    """
    AllStatsMean, AllStatsSTD, AllStatsDF = fun_classify(inputFile, groupsSel, FeatSelect, Nfeats)

    Reads a feature table, optionally scales it and reduces it to Nfeats
    features, then evaluates five classifiers with repeated stratified
    K-fold cross-validation. Uses the module-level names ``realizations``,
    ``K_folds`` and ``Nmodes``.

    inputFile: the .csv file containing the feature table. Rows are filtered
        on the "Group" column; column 0 is used as the class label.
    groupsSel: the selected groups to classify. Full set is
        ["S","F","Z","N","O"], but ["S","F","Z"] are of most interest for
        the article (ictal, inter-ictal and normal EEG)
    FeatSelect: feature selection method: PCA, RFE, fisher or none
    Nfeats: number of selected features
    scaleFeats: when truthy, features are standardised before selection

    Returns:
        AllStatsMean: mean performance values (DataFrame, one column per
            classifier)
        AllStatsSTD: standard deviation of performance values (DataFrame)
        AllStatsDF: per-realization performance of every classifier,
            concatenated, with "Nmodes" and "Classifier" columns
    """
    # Read the input features and keep only the selected groups.
    dfFeats = pd.read_csv(inputFile, sep=',', header=0)
    dfFeats = dfFeats[dfFeats["Group"].isin(groupsSel)]

    # Skip the decomposition-method execution time column when present.
    # NOTE(review): "decTaime" may be a typo for the timing column's name --
    # confirm against the CSV header.
    first_feature_col = 2 if "decTaime" in dfFeats else 1
    # Convert to an ndarray right away: the positional indexing used below
    # (x[:, cols], x[row_idx]) does not work on a DataFrame, so the
    # unscaled path (scaleFeats falsy) used to fail.
    x = dfFeats.iloc[:, first_feature_col:].values
    y = dfFeats.iloc[:, 0].values

    if scaleFeats:
        x = StandardScaler().fit_transform(x)

    # Feature selection, only when there is something to remove.
    if x.shape[1] > Nfeats:
        if FeatSelect == "RFE":
            rfeModel = SVC(kernel="linear",
                           C=0.025,
                           probability=True,
                           gamma='scale')
            rfeSelect = RFE(rfeModel, n_features_to_select=Nfeats)
            rfe_fit = rfeSelect.fit(x, y)
            x = x[:, rfe_fit.support_]
        elif FeatSelect == "PCA":
            x = PCA(n_components=Nfeats).fit_transform(x)
        elif FeatSelect == "fisher":
            fisherScore = fisher_score.fisher_score(x, y)
            ranking = fisher_score.feature_ranking(fisherScore)
            x = x[:, ranking[:Nfeats]]

    names = ["KNN", "Linear SVM", "RBF SVM", "GPC", "MLP"]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True, gamma='scale'),
        SVC(probability=True, gamma='scale'),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        MLPClassifier(alpha=1, max_iter=200)
    ]

    # Performance containers; key order fixes the output row/column order.
    stat_names = ("Accuracy", "SensitivityMean", "SpecificityMean",
                  "AUC_Mean", "SensitivityIctal", "SpecificityIctal",
                  "AUC_Ictal", "TTtimes")
    AllStats = {
        name: {
            stat: np.zeros([realizations, K_folds]) for stat in stat_names
        }
        for name in names
    }
    AllStatsMean = {name: dict.fromkeys(stat_names, 0.) for name in names}
    AllStatsSTD = {name: dict.fromkeys(stat_names, 0.) for name in names}

    # Loop-invariant: whether this is a multi-class problem.
    multiclass = len(np.unique(y)) > 2

    for i in range(realizations):
        skf = StratifiedKFold(n_splits=K_folds, shuffle=True)
        for ki, (train_idx, test_idx) in enumerate(skf.split(x, y)):
            X_train, X_test = x[train_idx], x[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            for name, clf in zip(names, classifiers):
                # Time training plus testing for each classifier.
                tic = time.time()
                modelFit = clf.fit(X_train, y_train)
                yPredicted = modelFit.predict(X_test)
                probsTest = modelFit.predict_proba(X_test)
                toc = time.time()

                # Per-class AUC (one-vs-rest) for multi-class problems,
                # otherwise a single AUC on the second class' probability.
                if multiclass:
                    AUCs = roc_auc_score(
                        LabelBinarizer().fit_transform(y_test),
                        probsTest,
                        average=None)
                else:
                    AUCs = roc_auc_score(y_test,
                                         probsTest[:, 1],
                                         average=None)

                # Per-class sensitivity and specificity from the confusion
                # matrix.
                cMatrix = confusion_matrix(y_test, yPredicted)
                TP = np.diag(cMatrix)
                FP = cMatrix.sum(axis=0) - TP
                FN = cMatrix.sum(axis=1) - TP
                TN = cMatrix.sum() - (FP + FN + TP)
                TPR = TP / (TP + FN)  # sensitivity
                TNR = TN / (TN + FP)  # specificity / true negative rate

                stats = AllStats[name]
                stats["Accuracy"][i, ki] = accuracy_score(y_test, yPredicted)
                stats["SensitivityMean"][i, ki] = np.mean(TPR)
                stats["SpecificityMean"][i, ki] = np.mean(TNR)
                # Index 0 is the first class label in sorted order, which
                # the original code treats as the ictal class.
                stats["SensitivityIctal"][i, ki] = TPR[0]
                stats["SpecificityIctal"][i, ki] = TNR[0]
                stats["AUC_Mean"][i, ki] = np.mean(AUCs)
                stats["TTtimes"][i, ki] = toc - tic
                # In the binary case AUC_Ictal keeps its initial zeros.
                if multiclass:
                    stats["AUC_Ictal"][i, ki] = AUCs[0]

    AllStatsDF = [0] * len(names)
    for pos, name in enumerate(names):
        for istat in AllStats[name].keys():
            # Average over folds first, then summarise over realizations.
            AllStats[name][istat] = np.mean(AllStats[name][istat], axis=1)
            AllStatsMean[name][istat] = np.mean(AllStats[name][istat])
            AllStatsSTD[name][istat] = np.std(AllStats[name][istat])
        AllStatsDF[pos] = pd.DataFrame.from_dict(AllStats[name])
        AllStatsDF[pos]["Nmodes"] = Nmodes
        AllStatsDF[pos]["Classifier"] = name

    return pd.DataFrame.from_dict(AllStatsMean), pd.DataFrame.from_dict(
        AllStatsSTD), pd.concat(AllStatsDF)
#ReliefF score_rel = reliefF.reliefF(X_train, y_train) idx_rel = reliefF.feature_ranking(score_rel) #Laplacian score kwargs_W = { "metric": "euclidean", "neighbor_mode": "knn", "k": 7, 't': 1, 'reliefF': True } W = construct_W.construct_W(X_train, **kwargs_W) score_lap = lap_score.lap_score(X_train, W=W) idx_lap = lap_score.feature_ranking(score_lap) #Fisher score_fish = fisher_score.fisher_score(X_train, y_train) print(score_fish) idx_fish = fisher_score.feature_ranking(score_fish) ###################################### Feature Integration idxM = idx_rel[:threshold] idxN = idx_lap[:threshold] idxO = idx_fish[:threshold] if combination_method == 1: #AND idx_and = reduce(np.intersect1d, (idxO, idxM, idxN)) idx = idx_and print("number of selectes features (bins) = ", idx.shape[0]) if combination_method == 2: #OR
# labels = train_df['TARGET'].values # data = train_df[feats].as_matrix() # only use training data for feature selection # klass = DiscreteMrmr # num_features = 50 # targets = labels.astype(bool) # variables = data.astype(float) # nrow, ncol = variables.shape # selector = klass(num_features, klass.MID, THRESHOLD) # # # b = time.time() # ui = None # maxrel, mrmr = selector._mrmr_selection(num_features, klass.MID, variables, targets, threshold=THRESHOLD, ui=ui) cut_X = pd.qcut(train_X, 20,labels=False, retbins=True) from skfeature.function.similarity_based import fisher_score score = fisher_score.fisher_score(train_X, train_Y) from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import RFE from sklearn.svm import SVR bestFeat = SelectKBest() bestFeat.fit(train_X, train_Y) feat_scr = zip(feats,bestFeat.scores_) feat_scr = [f for f in feat_scr if not np.isnan(f[1])] sorted_fetas = sorted(feat_scr, key=lambda k:k[1], reverse=True) # estimator = SVR(kernel="linear") # selector = RFE(estimator, 5, step=1) # selector.fit(train_X, train_Y) # slow
def Fisher_Score(self):
    """Rank features by Fisher score on ``X_train`` / ``y_train``.

    ``X_train`` and ``y_train`` are read from the enclosing/module scope,
    not from ``self``.

    Returns:
        The feature ranking from ``fisher_score.feature_ranking``.
        Previously the ranking was computed and then discarded, so the
        method returned ``None``.
    """
    score = fisher_score.fisher_score(X_train, y_train)
    idx = fisher_score.feature_ranking(score)
    return idx
#print("Features before feature selection: {}".format(X.columns.values)) #Get classes y_data = ad['Label'] y = pd.DataFrame(y_data) y = y.values.ravel() #Save the resmapling data into npy X_resampled = np.load('cervical_x.npy') y_resampled = np.load('Cervical_y.npy') X_resampled = pd.DataFrame(X_resampled) X_resampled.columns = X.columns.values cv = StratifiedKFold(n_splits=10) for train, test in cv.split(X_resampled, y_resampled): score = fisher_score.fisher_score(X_resampled.iloc[train], y_resampled[train]) #print(score) idx = fisher_score.feature_ranking(score) #X1 = X_resampled.iloc[:, [idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8], idx[9], idx[10], idx[11]]] X1 = X_resampled.iloc[:, idx[0:11]] #print(X_resampled.columns.values) X_resampled = X1 print("Selected Features in Fisher{}".format(X_resampled.columns)) #Cross Validation #Decision Tree
def my_fisher_score(X, y):
    """Compute the Fisher score on a deep copy of X and a flattened y.

    ``fisher_score`` therefore never receives the caller's ``X`` object
    itself.
    """
    features_copy = copy.deepcopy(X)
    flat_labels = y.flatten()
    return fisher_score(features_copy, flat_labels)
# labels = train_df['TARGET'].values # data = train_df[feats].as_matrix() # only use training data for feature selection # klass = DiscreteMrmr # num_features = 50 # targets = labels.astype(bool) # variables = data.astype(float) # nrow, ncol = variables.shape # selector = klass(num_features, klass.MID, THRESHOLD) # # # b = time.time() # ui = None # maxrel, mrmr = selector._mrmr_selection(num_features, klass.MID, variables, targets, threshold=THRESHOLD, ui=ui) cut_X = pd.qcut(train_X, 20, labels=False, retbins=True) from skfeature.function.similarity_based import fisher_score score = fisher_score.fisher_score(train_X, train_Y) from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import RFE from sklearn.svm import SVR bestFeat = SelectKBest() bestFeat.fit(train_X, train_Y) feat_scr = zip(feats, bestFeat.scores_) feat_scr = [f for f in feat_scr if not np.isnan(f[1])] sorted_fetas = sorted(feat_scr, key=lambda k: k[1], reverse=True) # estimator = SVR(kernel="linear") # selector = RFE(estimator, 5, step=1) # selector.fit(train_X, train_Y) # slow