def dimensionality_reduction(X_train, X_test, y_train, n_features, method):
    """Reduce the train and test matrices to ``n_features`` dimensions.

    Supported values of ``method``:

    * ``"ReliefF"``   -- keep the first ``n_features`` columns of the ReliefF
      ranking computed on the training data.
    * ``"LDA"``       -- project with ``LDA`` fitted on ``(X_train, y_train)``.
    * ``"PCA"``       -- project with ``PCA`` fitted on ``X_train``.
    * ``"KernelPCA"`` -- project with an RBF-kernel ``KernelPCA`` fitted on
      ``X_train``.

    Any other value leaves both matrices untouched.

    Returns:
        tuple: ``(X_train, X_test)`` after the reduction.
    """
    if method == "ReliefF":
        # Score vector used to select the attributes.
        relief_scores = reliefF.reliefF(X_train, y_train)
        # Attribute indices ordered by the ranking built from the scores.
        ranking = reliefF.feature_ranking(relief_scores)
        # Keep only the first n_features ranked columns in both matrices.
        X_train = X_train[:, ranking[0:n_features]]
        X_test = X_test[:, ranking[0:n_features]]
        return (X_train, X_test)

    # The projection methods differ only in the estimator and in whether the
    # labels take part in fitting; the test set is always transformed with
    # the estimator fitted on the training set.
    if method == "LDA":
        reducer = LDA(n_components=n_features)
        X_train = reducer.fit_transform(X_train, y_train)
    elif method == "PCA":
        reducer = PCA(n_components=n_features)
        X_train = reducer.fit_transform(X_train)
    elif method == "KernelPCA":
        reducer = KernelPCA(n_components=n_features, kernel='rbf')
        X_train = reducer.fit_transform(X_train)
    else:
        # Unknown method: pass the data through unchanged.
        return (X_train, X_test)

    X_test = reducer.transform(X_test)
    return (X_train, X_test)
def relieF(data):
    """Aggregate ReliefF feature rankings over the first six entries of ``data``.

    Each ``data[i]`` (``i`` in 0..5) is sliced as a 2-D array whose last
    column holds the labels and whose remaining columns hold the features.
    A ReliefF ranking is computed per entry, passed through ``samp``, and the
    six results are combined with ``agg.instant_runoff``.

    Returns:
        list[int]: the aggregated ranking, each element converted to ``int``.
    """
    rankings = []
    for part in range(6):
        features = data[part][:, :-1]
        labels = data[part][:, -1]
        relief_scores = reliefF.reliefF(features, labels)
        ordered = reliefF.feature_ranking(relief_scores)
        # NOTE(review): `samp` is defined outside this block; it receives the
        # ranking as a plain list -- confirm what it returns.
        rankings.append(samp(ordered.tolist()))

    aggregated = agg.instant_runoff(rankings)
    return [int(entry) for entry in aggregated]
def run_fold(trial, P, X, y, method, dataset, parttype):
    """Rank/select features on the training rows of one fold and time it.

    Args:
        trial: column index into ``P`` identifying the fold.
        P: 2-D partition matrix; rows where ``P[:, trial] == 1`` are used as
            the training set.
        X: 2-D data matrix (rows are samples, columns are features).
        y: label vector indexed like the rows of ``X``.
        method: one of ``'fisher'``, ``'chi2'``, ``'relieff'``, ``'jmi'``,
            ``'mrmr'``, ``'infogain'``, ``'svmrfe'``, ``'hdmr'``,
            ``'hdmrhaar'``.
        dataset: dataset name, used only in the progress message.
        parttype: partition type, used only in the progress message.

    Returns:
        dict: ``{'features': <selector output>, 'cputime': <seconds>}``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
            (Previously this printed a message and then failed with a
            ``NameError`` on the unbound ``features``.)
    """
    print('Obtaining features for %s %s %s fold: %2d'
          % (parttype, method, dataset, trial))
    _, n_features = X.shape

    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        # Ask for all features so the selector output covers every column.
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method == 'hdmr':
        features = _hdmr_features(trnX, trnY, X, 3, 'L')
    elif method == 'hdmrhaar':
        features = _hdmr_features(trnX, trnY, X, 255, 'H')
    else:
        raise ValueError('%s does not exist' % method)
    cputime = time.time() - start_time

    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}


def _hdmr_features(trnX, trnY, X, p, basis):
    """Learn HDMR models on the training rows and return the selected features.

    Loads ``sobol_set`` from ``sobol_set.mat`` in the working directory and
    passes it, together with ``p`` and ``basis`` (the ``'p'`` and ``'b'``
    entries), to ``hdmrlearn``.
    """
    sobol_set = scipy.io.loadmat('sobol_set.mat')['sobol_set'].astype(float)
    params = {'sobol_set': sobol_set, 'k': 1, 'p': p, 'M': 1000, 'b': basis}
    models = hdmrlearn(trnX, trnY, params)
    # NOTE(review): selection receives the full X rather than trnX, as in the
    # original code -- confirm this is intended and does not leak test rows.
    features, _ = hdmrselect(X, models)
    return features
def main():
    """Leave-one-out evaluation of ReliefF selection + linear SVM on nci9.

    Loads ``../data/nci9.mat``, and for every leave-one-out split ranks the
    features with ReliefF on the training rows only, trains a ``LinearSVC``
    on the first ``num_fea`` ranked features and predicts the held-out
    sample. The overall accuracy is printed at the end.

    The feature ranking is computed inside the loop: ranking once on the full
    data set would let the held-out sample's label influence which features
    are selected and would bias the reported accuracy upwards.
    """
    # load data
    mat = scipy.io.loadmat('../data/nci9.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]            # label

    splitter = LeaveOneOut()

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC(random_state=42)    # linear SVM

    y_pred = []
    for train, test in splitter.split(X):
        # obtain the score of each feature on the training set
        score = reliefF.reliefF(X[train], y[train])
        # rank the features according to that score
        idx = reliefF.feature_ranking(score)
        top = idx[0:num_fea]

        # train on the selected features of the training rows
        clf.fit(X[train][:, top], y[train])

        # predict the held-out sample; predictions are collected flat and
        # assumed to come back in sample order so they line up with y below
        y_pred.extend(clf.predict(X[test][:, top]))

    # output the leave-one-out classification accuracy
    print(accuracy_score(y, y_pred))
def main():
    """10-fold evaluation of ReliefF selection (10 features) on COIL20.

    Loads ``../data/COIL20.mat``; for each fold, ranks the features with
    ReliefF on the training rows, trains a ``LinearSVC`` on the first
    ``num_fea`` ranked features and scores it on the test rows. Prints the
    selection details per fold and the mean accuracy at the end.
    """
    # load data and labels
    dataset = scipy.io.loadmat('../data/COIL20.mat')
    X = dataset['X'].astype(float)
    y = dataset['Y'][:, 0]
    n_samples, _ = X.shape

    # split data into 10 folds
    folds = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    num_fea = 10                # number of selected features
    clf = svm.LinearSVC()       # linear SVM

    fold_accuracies = []
    for train, test in folds:
        # score the features on the training rows and rank them
        idx = reliefF.feature_ranking(reliefF.reliefF(X[train], y[train]))

        # restrict the data set to the selected features
        selected_features = X[:, idx[0:num_fea]]
        print('num:', num_fea)
        print('selected_fs:', idx)

        # train on the training rows, evaluate on the test rows
        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])
        fold_accuracies.append(accuracy_score(y[test], y_predict))

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(sum(fold_accuracies)) / 10)
def main():
    """10-fold evaluation of ReliefF selection (100 features) on COIL20.

    Loads ``../data/COIL20.mat``; for each fold, ranks the features with
    ReliefF on the training rows, trains a ``LinearSVC`` on the first
    ``num_fea`` ranked features and scores it on the test rows. Prints the
    mean accuracy over the folds.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]            # label
    n_samples, _ = X.shape

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100              # number of selected features
    clf = svm.LinearSVC()      # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the score of each feature on the training set
        score = reliefF.reliefF(X[train], y[train])

        # rank the features according to that score
        idx = reliefF.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train on the training rows, predict the test rows
        clf.fit(selected_features[train], y[train])
        y_predict = clf.predict(selected_features[test])

        # accumulate the classification accuracy on the test data
        correct = correct + accuracy_score(y[test], y_predict)

    # output the average classification accuracy over all 10 folds.
    # A single %-formatted argument prints the same text under Python 2 and
    # Python 3 (the former print statement was a SyntaxError on Python 3).
    print('Accuracy: %s' % (float(correct) / 10))
# Evaluate the classifier on the first `num_features` entries of the ranking
# `idx` that was computed before this point, and record the test accuracy.
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# fisher_score: rank on the training data, then fit/evaluate as above
score = fisher_score.fisher_score(X_train, y_train)
idx = fisher_score.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# reliefF: rank on the training data, then fit/evaluate as above
score = reliefF.reliefF(X_train, y_train)
idx = reliefF.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# chi_square: scored on the absolute values of the training data
# (presumably because the chi-square score expects non-negative inputs --
# TODO confirm); the classifier still sees the original signed values.
score = chi_square.chi_square(np.abs(X_train), y_train)
idx = chi_square.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# pca: projection onto `num_features` components (continues past this span)
pca = PCA(n_components=num_features)
def relief_FS(X_train, y_train):
    """Score and rank features with ReliefF on the training data.

    Returns:
        tuple: ``(idx, score)`` where ``score`` is the output of
        ``reliefF.reliefF`` and ``idx`` is the ranking that
        ``reliefF.feature_ranking`` derives from it.
    """
    relief_scores = reliefF.reliefF(X_train, y_train)
    ranking = reliefF.feature_ranking(relief_scores)
    return ranking, relief_scores
def relief_FS(X_train, y_train):
    """Compute ReliefF scores for the training data and rank the features.

    Returns:
        tuple: ``(idx, score)`` -- the ranking produced by
        ``reliefF.feature_ranking`` followed by the raw ``reliefF.reliefF``
        scores it was built from.
    """
    weights = reliefF.reliefF(X_train, y_train)
    order = reliefF.feature_ranking(weights)
    return order, weights
# Standardize the features: the scaler is fitted on the training set only and
# then applied unchanged to the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Median imputation followed by standardization, again fitted on the training
# set only.
# NOTE(review): the data was already standardized just above, so the
# 'std_scaler' step looks redundant -- confirm whether both are intended.
num_pip = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
X_train = num_pip.fit_transform(X_train)
X_test = num_pip.transform(X_test)
print('fs')

# Apply feature selection methods: ReliefF, Laplacian score and Fisher score.
# Each ranking is computed from the training data only.

# ReliefF (supervised: uses y_train)
score_rel = reliefF.reliefF(X_train, y_train)
idx_rel = reliefF.feature_ranking(score_rel)

# Laplacian score: build the affinity matrix W from X_train alone, then score
# the features against it (no labels are passed).
# NOTE(review): the 'reliefF' flag is passed alongside neighbor_mode "knn";
# confirm that construct_W actually uses it in this mode.
kwargs_W = {
    "metric": "euclidean",
    "neighbor_mode": "knn",
    "k": 7,
    't': 1,
    'reliefF': True
}
W = construct_W.construct_W(X_train, **kwargs_W)
score_lap = lap_score.lap_score(X_train, W=W)
idx_lap = lap_score.feature_ranking(score_lap)

# Fisher score (supervised: uses y_train); the raw scores are printed before
# ranking.
score_fish = fisher_score.fisher_score(X_train, y_train)
print(score_fish)
idx_fish = fisher_score.feature_ranking(score_fish)
def fit(self, X, y):
    """Run the configured feature selector and return its feature indices.

    The selector family is chosen by ``self.tp`` and, within a family, by
    ``self.name``; hyper-parameters are read from ``self.params``:

    * ``'ITB'``     -- ``'MRMR'``
    * ``'filter'``  -- ``'Relief'``, ``'Fisher'``, ``'MI'``
    * ``'wrapper'`` -- fits ``self.model`` and takes the support of
      ``SelectFromModel``
    * ``'SLB'``     -- ``'SMBA'``, ``'RFS'``, ``'ll_l21'``, ``'ls_l21'``,
      ``'LASSO'``, ``'EN'``; ``y`` is first converted with
      ``construct_label_matrix``

    Returns:
        The index sequence produced by the selected method, or an empty list
        when no family/name combination matches.
    """

    def _rank_by_coefficients(estimator, data, target):
        # Fit a linear model and order the features by decreasing coefficient;
        # for a 2-D coefficient matrix only its first row is used.
        fitted = estimator.fit(data, target)
        if fitted.coef_.ndim == 1:
            coeff = fitted.coef_
        else:
            coeff = np.asarray(fitted.coef_[0, :])
        return np.argsort(-coeff)

    idx = []
    family = self.tp

    if family == 'ITB':
        if self.name == 'MRMR':
            idx = MRMR.mrmr(X, y,
                            n_selected_features=self.params['num_feats'])

    elif family == 'filter':
        name = self.name
        if name == 'Relief':
            idx = reliefF.feature_ranking(
                reliefF.reliefF(X, y, k=self.params['k']))
        elif name == 'Fisher':
            # score each feature on the training set, then rank by that score
            idx = fisher_score.feature_ranking(
                fisher_score.fisher_score(X, y))
        elif name == 'MI':
            # argsort is ascending; reverse it to put the largest mutual
            # information first
            mi = mutual_info_classif(
                X, y, n_neighbors=self.params['n_neighbors'])
            idx = np.argsort(mi)[::-1]

    elif family == 'wrapper':
        fitted_model = self.model.fit(X, y)
        selector = SelectFromModel(fitted_model, prefit=True)
        idx = selector.get_support(indices=True)

    elif family == 'SLB':
        # one-hot-encode the target
        y = construct_label_matrix(y)
        name = self.name

        if name == 'SMBA':
            cfg = self.params
            scba = fs.SCBA(data=X,
                           alpha=cfg['alpha'],
                           norm_type=cfg['norm_type'],
                           verbose=cfg['verbose'],
                           thr=cfg['thr'],
                           max_iter=cfg['max_iter'],
                           affine=cfg['affine'],
                           normalize=cfg['normalize'],
                           step=cfg['step'],
                           PCA=cfg['PCA'],
                           GPU=cfg['GPU'],
                           device=cfg['device'])
            nrmInd, sInd, repInd, _ = scba.admm()
            # pick which of the returned index sets to expose; sInd is the
            # fallback for any other 'type_indices' value
            wanted = self.params['type_indices']
            if wanted == 'nrmInd':
                idx = nrmInd
            elif wanted == 'repInd':
                idx = repInd
            else:
                idx = sInd
        elif name == 'RFS':
            W = RFS.rfs(X, y, gamma=self.params['gamma'])
            idx = feature_ranking(W)
        elif name == 'll_l21':
            # obtain the feature weight matrix, then rank the features from it
            W, _, _ = ll_l21.proximal_gradient_descent(
                X, y, z=self.params['z'], verbose=False)
            idx = feature_ranking(W)
        elif name == 'ls_l21':
            # obtain the feature weight matrix, then rank the features from it
            W, _, _ = ls_l21.proximal_gradient_descent(
                X, y, z=self.params['z'], verbose=False)
            idx = feature_ranking(W)
        elif name == 'LASSO':
            idx = _rank_by_coefficients(
                Lasso(alpha=self.params['alpha'], positive=True), X, y)
        elif name == 'EN':
            # elastic net with l1_ratio=1
            idx = _rank_by_coefficients(
                ElasticNet(alpha=self.params['alpha'], l1_ratio=1,
                           positive=True), X, y)

    return idx
def fit(self, X, y):
    """Run the feature selector named by ``self.name`` and return its indices.

    Supported names: ``'LASSO'``, ``'EN'``, ``'RFS'``, ``'ll_l21'``,
    ``'ls_l21'``, ``'MRMR'`` (only when ``self.tp == 'ITB'``), ``'Relief'``
    and ``'MI'``. Hyper-parameters are read from ``self.params``.

    Returns:
        The index sequence produced by the selected method, or an empty list
        when nothing matches. (Previously an unmatched configuration failed
        with ``UnboundLocalError`` at the ``return``.)
    """

    def _rank_by_coefficients(estimator):
        # Fit a linear model and order the features by decreasing coefficient;
        # for a 2-D coefficient matrix only its first row is used.
        fitted = estimator.fit(X, y)
        if fitted.coef_.ndim == 1:
            coeff = fitted.coef_
        else:
            coeff = np.asarray(fitted.coef_[0, :])
        return np.argsort(-coeff)

    # Default result so that an unknown selector yields "no features" rather
    # than an unbound local.
    idx = []

    if self.name == 'LASSO':
        idx = _rank_by_coefficients(
            Lasso(alpha=self.params['alpha'], positive=True))

    if self.name == 'EN':
        # elastic net with l1_ratio=1
        idx = _rank_by_coefficients(
            ElasticNet(alpha=self.params['alpha'], l1_ratio=1, positive=True))

    if self.name == 'RFS':
        W = RFS.rfs(X, construct_label_matrix(y), gamma=self.params['gamma'])
        idx = feature_ranking(W)

    if self.name == 'll_l21':
        # obtain the feature weight matrix, then rank the features from it
        W, _, _ = ll_l21.proximal_gradient_descent(
            X, construct_label_matrix(y), z=self.params['z'], verbose=False)
        idx = feature_ranking(W)

    if self.name == 'ls_l21':
        # obtain the feature weight matrix, then rank the features from it
        W, _, _ = ls_l21.proximal_gradient_descent(
            X, construct_label_matrix(y), z=self.params['z'], verbose=False)
        idx = feature_ranking(W)

    if self.tp == 'ITB':
        if self.name == 'MRMR':
            idx = MRMR.mrmr(X, y,
                            n_selected_features=self.params['num_feats'])

    # NOTE(review): 'Relief' and 'MI' are dispatched on the name alone,
    # independent of self.tp -- confirm this matches the intended nesting.
    if self.name == 'Relief':
        score = reliefF.reliefF(X, y, k=self.params['k'])
        idx = reliefF.feature_ranking(score)

    if self.name == 'MI':
        # argsort is ascending; reverse it to put the largest mutual
        # information first
        idx = np.argsort(
            mutual_info_classif(
                X, y, n_neighbors=self.params['n_neighbors']))[::-1]

    return idx