import logging

from sklearn.feature_selection import SelectFdr, f_classif


def select_fdr(input_data, feature_names=None, score_func=f_classif, alpha=0.05):
    # remove_constant, flatten and make3D are project helpers defined elsewhere
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data, feature_names)
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            # too few features survived; relax alpha and try again
            msg = 'Feature selection was too aggressive, '
            msg += 'increasing alpha from {} to {}'.format(alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)
    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]
    logging.info('Selected {} features'.format(x_train.shape[1]))
    final_args = {'score_func': score_func, 'alpha': alpha}
    return output_data, feature_names, final_args
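
The core of the function above is the alpha-escalation loop: keep relaxing the FDR bound until at least two features survive. A minimal standalone sketch of that idea on synthetic data (the dataset, split, and step size are illustrative, not part of the original function):

import logging
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFdr, f_classif
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=50, n_informative=5,
                           random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

alpha = 0.05
while True:
    selector = SelectFdr(score_func=f_classif, alpha=alpha)
    x_train_sel = selector.fit_transform(x_train, y_train)
    if x_train_sel.shape[1] > 1:
        break
    logging.warning('Too few features kept at alpha=%.2f, relaxing', alpha)
    alpha += 0.05
x_test_sel = selector.transform(x_test)  # reuse the fitted mask on the test set
print(x_train_sel.shape, x_test_sel.shape)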
import pandas as pd
from sklearn.feature_selection import SelectFdr, SelectKBest


def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20,
                   alpha=0.05, model=None):
    '''
    Gets the K best features (filtered by FDR, then the best ranked by t-test;
    more advanced options can be implemented).
    Saves the data/matrix with the kept features to a new output file,
    "REDUCED_Feat.csv".
    '''
    FD = SelectFdr(alpha=alpha)
    X = FD.fit_transform(X, y)
    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    # selectK_mask indexes the FDR-reduced matrix, so compose it with the
    # FDR mask before applying it to the full-length names/columns
    fdr_mask = FD.get_support()
    combined_mask = fdr_mask.copy()
    combined_mask[fdr_mask] = selectK_mask
    K_featnames = featureNames[combined_mask]
    print("K_featnames: %s" % (K_featnames))
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[combined_mask]]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df
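
A hypothetical usage sketch for featureFitting on a toy CSV; the file layout (an index column followed by one column per feature) is an assumption based on how the mask is applied above, and kbest is kept small so the FDR step is unlikely to leave fewer features than requested:

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=30, n_informative=10,
                           random_state=0)
names = np.array(['f%d' % i for i in range(30)])
# toy CSV: index column plus one column per feature
pd.DataFrame(X, columns=names).to_csv('toy_features.csv')

reduced = featureFitting('toy_features.csv', X, y, names,
                         optimalFlag=None, kbest=5)
print(reduced.shape)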
import pandas as pd

# importing the dataset; the first column is the target
dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

# feature selector 1
# the target is continuous, so use the regression F-test rather than
# the default f_classif
from sklearn.feature_selection import SelectKBest, f_regression
fs1 = SelectKBest(f_regression, k=5)
x_new1 = fs1.fit_transform(x, y)

# feature selector 2
from sklearn.feature_selection import SelectFdr
fs2 = SelectFdr(f_regression)
x_new2 = fs2.fit_transform(x, y)

# feature selector 3
from sklearn.linear_model import LinearRegression
estimator = LinearRegression()
from sklearn.feature_selection import RFE
fs3 = RFE(estimator, n_features_to_select=5)
x_new3 = fs3.fit_transform(x, y)

# feature selector 4
from sklearn.feature_selection import SelectFromModel
fs4 = SelectFromModel(estimator)
x_new4 = fs4.fit_transform(x, y)

# feature selector 5 (SelectFwe follows the same fit/transform pattern)
from sklearn.feature_selection import SelectFwe
fs5 = SelectFwe(f_regression)
x_new5 = fs5.fit_transform(x, y)
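
Each of these selectors exposes get_support(), which maps the reduced matrix back to the original columns. A short follow-up sketch (it assumes the feature names come from the same CSV read above):

# print which original columns each selector kept
feature_names = dataset.columns[1:]
for label, fs in [('KBest', fs1), ('FDR', fs2), ('RFE', fs3),
                  ('FromModel', fs4), ('FWE', fs5)]:
    kept = feature_names[fs.get_support()]
    print('%s kept %d features: %s' % (label, kept.size, list(kept)))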
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import SelectFdr, VarianceThreshold

# excerpt: X, Y and name are defined earlier in the script
t_1 = 0.0   # assumed value for the first threshold (not shown in this excerpt)
t_2 = 0.05
vt_1 = VarianceThreshold(threshold=t_1)
vt_2 = VarianceThreshold(threshold=t_2)
X_1 = vt_1.fit_transform(X)
X_2 = vt_2.fit_transform(X)
print(f'{name}: Variance threshold={t_1}, Number of features: {X_1.shape[1]}')
print(f'{name}: Variance threshold={t_2}, Number of features: {X_2.shape[1]}')

# =========================
# Univariate selection with stat test
# =========================
fdr = SelectFdr()
X_fdr = fdr.fit_transform(X, Y)
print(f'{name}: FDR, Number of features: {X_fdr.shape[1]}')

# =========================
# L1 Based
# =========================
# Linear Lasso
# Note: the assignment specifies a linear regression algorithm,
# not logistic regression!
alphas = np.linspace(0.01, 1, 1000)
scores = []
features_number = []
for alpha in alphas:
    # C is the inverse regularization strength, so small values
    # mean stronger L1 sparsity
    clf = linear_model.LogisticRegression(penalty='l1', C=alpha,
                                          solver='liblinear')
    clf.fit(X, Y)
    # record the fit score and the number of surviving (non-zero) coefficients
    scores.append(clf.score(X, Y))
    features_number.append(np.sum(clf.coef_ != 0))
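
One way to turn the sweep above into an actual selection step is to refit at a chosen C and wrap the estimator in SelectFromModel; a sketch, where picking the C with the best training score is purely illustrative:

from sklearn.feature_selection import SelectFromModel

best_alpha = alphas[int(np.argmax(scores))]
clf = linear_model.LogisticRegression(penalty='l1', C=best_alpha,
                                      solver='liblinear')
sfm = SelectFromModel(clf)
X_l1 = sfm.fit_transform(X, Y)
print(f'{name}: L1, C={best_alpha:.3f}, Number of features: {X_l1.shape[1]}')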
# encode string class labels as consecutive integers
y = np.zeros(Y.shape)
class_names = list(np.unique(Y))
class_num = 0
number_of_classes = np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y == classes] = int(class_num)
    print('Class ' + classes + ': ' + str(class_num))
    class_num = class_num + 1

X = StandardScaler().fit_transform(X)    #### for ANOVA
#X = MinMaxScaler().fit_transform(X)     #### for Chi2

## Select features
fdr = SelectFdr(f_classif, alpha=0.005)  #### for ANOVA
#fdr = SelectFdr(chi2, alpha=0.05)       #### for Chi2
X_sel = fdr.fit_transform(X, y)
idx_sorted = fdr.get_support(indices=True)
fdr_select_features = list(feature_set[i] for i in idx_sorted)
print('Selected features with FDR: ')
print(fdr_select_features)
print('\n')
print(X.shape)
print(X_sel.shape)

X_new = df[fdr_select_features].values
Y = df['Class']
le = preprocessing.LabelEncoder()
y = le.fit_transform(Y)
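
The commented-out chi2 path above pairs with MinMaxScaler rather than StandardScaler because chi2 requires non-negative inputs. A minimal sketch of that variant on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFdr, chi2
from sklearn.preprocessing import MinMaxScaler

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     n_informative=5, random_state=0)
X_pos = MinMaxScaler().fit_transform(X_demo)  # chi2 requires X >= 0
fdr_chi2 = SelectFdr(chi2, alpha=0.05)
X_chi2 = fdr_chi2.fit_transform(X_pos, y_demo)
print(X_chi2.shape)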
# module-level imports needed by this method
import codecs
import math

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectFdr, chi2


def extract_features_fdr(self, file_name, N=-1, alpha=5e-2, direction=False,
                         allow_subseq=True, binarization=True,
                         remove_redundant_markers=True):
    '''
    Feature extraction with FDR correction
    '''
    # https://brainder.org/2011/09/05/fdr-corrected-fdr-adjusted-p-values/
    # Filter: select the p-values for an estimated false discovery rate.
    # This uses the Benjamini-Hochberg procedure; alpha is an upper bound
    # on the expected false discovery rate.
    selector = SelectFdr(chi2, alpha=alpha)
    if binarization == 'median':
        # binarize against the per-feature median
        median_vec = np.median(self.X.toarray(), axis=0)
        X = np.zeros(self.X.shape)
        X[np.where(self.X.toarray() > median_vec)] = 1
        X = csr_matrix(X)
    elif binarization:
        # binarize on presence/absence
        X = self.X.toarray()
        X[np.where(X > 0)] = 1
        X = csr_matrix(X)
    else:
        X = self.X
    #if remove_redundant_markers:
    #    dist = get_kl_rows(X.T)
    #    dist = dist + dist.T
    selector.fit_transform(X, self.Y)
    # chi2 scores and p-values are computed for every feature,
    # whether or not it passed the FDR threshold
    scores = {self.feature_names[i]: (s, selector.pvalues_[i])
              for i, s in enumerate(list(selector.scores_))
              if not math.isnan(s)}
    f = codecs.open(file_name, 'w')
    c_1 = np.sum(self.Y)
    c_0 = np.sum([1 for x in self.Y if x == 0])
    f.write('\t'.join(['Motif', 'Chi2-score', 'p-value']) + '\n')
    X = X.toarray()
    pos_scores = []
    new_scores = {}
    for w, score in scores.items():
        if score[1] < 0.05:
            feature_array = X[:, self.feature_names.index(w)]
            pos = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 1]
            neg = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 0]
            m_pos = np.mean(pos)
            s_pos = np.std(pos)
            m_neg = np.mean(neg)
            s_neg = np.std(neg)
            # 2x2 contingency counts: c11/c01 in the positive class,
            # c10/c00 in the negative class
            c11 = np.sum(pos)
            c01 = c_1 - c11
            c10 = np.sum(neg)
            c00 = c_0 - c10
            s = score[0]
            if direction and c11 > ((1.0 * c11) * c00 - (c10 * 1.0) * c01):
                s = -s
            if s > 0 and len(w) < 25:
                new_scores[w] = score
    # sort by chi2 score, descending; keep the top N if requested
    scores = sorted(new_scores.items(), key=lambda kv: kv[1][0], reverse=True)
    if N != -1:
        scores = scores[0:N]
    extracted_features = []
    for w, score in scores:
        if score[1] < 0.05:
            feature_array = X[:, self.feature_names.index(w)]
            pos = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 1]
            neg = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 0]
            m_pos = np.mean(pos)
            s_pos = np.std(pos)
            m_neg = np.mean(neg)
            s_neg = np.std(neg)
            c11 = np.sum(pos)
            c01 = c_1 - c11
            c10 = np.sum(neg)
            c00 = c_0 - c10
            s = score[0]
            if direction and c11 > ((1.0 * c11) * c00 - (c10 * 1.0) * c01):
                s = -s
            s = np.round(s, 2)
            if allow_subseq:
                pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                #if m_pos > m_neg:
                f.write('\t'.join([str(w), str(s), str(score[1])]) + '\n')
            else:
                # skip any motif that is a subsequence of an already
                # extracted feature
                flag = False
                for feature in extracted_features:
                    if w in feature:
                        flag = True
                if not flag:
                    pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                    f.write('\t'.join([str(w), str(s), str(score[1])]) + '\n')
    f.close()
    return pos_scores
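
The class this method belongs to is not shown; since the snippet defines it at module level, it can be exercised with a hypothetical stand-in object exposing the three attributes it reads (X, Y, feature_names). A toy sketch where one motif is made perfectly associated with the positive class:

import numpy as np
from scipy.sparse import csr_matrix
from types import SimpleNamespace

rng = np.random.default_rng(0)
Y = np.array([1] * 30 + [0] * 30)
X = rng.integers(0, 2, size=(60, 10)).astype(float)
X[:, 0] = Y  # motif m0 occurs exactly in the positive samples
obj = SimpleNamespace(X=csr_matrix(X), Y=Y,
                      feature_names=['m%d' % i for i in range(10)])
rows = extract_features_fdr(obj, 'chi2_motifs.tsv', N=5)
print(rows[:3])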
# encode string class labels as consecutive integers
y = np.zeros(Y.shape)
class_names = list(np.unique(Y))
class_num = 0
number_of_classes = np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y == classes] = int(class_num)
    print('Class ' + classes + ': ' + str(class_num))
    class_num = class_num + 1

X = StandardScaler().fit_transform(X)    #### for ANOVA
#X = MinMaxScaler().fit_transform(X)     #### for Chi2

## Select features
fdr = SelectFdr(f_classif, alpha=0.005)  #### for ANOVA
#fdr = SelectFdr(chi2, alpha=0.05)       #### for Chi2
X_select = fdr.fit_transform(X, y)

##### to select the top 100 features ############
X_select = SelectKBest(f_classif, k=100).fit_transform(X_select, y)
#X_select = SelectKBest(chi2, k=100).fit_transform(X_select, y)

# note: these indices and names reflect the FDR step only; X_select has
# been further reduced to the 100 best of those features
idx_sorted = fdr.get_support(indices=True)
pvals = fdr.pvalues_
pscores = fdr.scores_
print(X.shape)
print(X_select.shape)
select_features = list(feature_set[i] for i in idx_sorted)
print('Selected features: ')
print(select_features)
fileName = r'\trainingSetFeatures.csv'
# filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap'
filePath = str(input('Input directory containing the training data csv: '))
## features, labels, lb_encoder, featureNames = load_data(filename, 'file')
features, labels, lb_encoder, featureNames = load_data(filePath + fileName, 'file')
X, y = features, labels

print('len(set(y))', len(set(y)))
print(X.shape, "X = samples, features")

scale = StandardScaler(copy=False)
X = scale.fit_transform(X)

FD = SelectFdr(alpha=0.0005)
FD_K = SelectPercentile(percentile=70)
X = FD.fit_transform(X, y)
print(X.shape, "X post FDR alpha filter")
X_FD = FD_K.fit_transform(X, y)
print(X_FD.shape, "X post FDR+K-best alpha filter")

print("\n BASE X models: \n")
# load_data, ModelParam_GridSearch and Kcv are project helpers defined elsewhere
ModelParam_GridSearch(X, y, cv=Kcv)

'''
pca = PCA(n_components='mle')
X_PCA = pca.fit_transform(X)
print(X_PCA.shape, "X - PCA,mle")
ModelParam_GridSearch(X_PCA, y, cv=Kcv)
'''
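
The FDR filter followed by a percentile filter above can also be expressed as a scikit-learn Pipeline. A minimal sketch on synthetic data; alpha is relaxed to 0.01 here (an illustrative choice) so the toy data reliably keeps some features:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFdr, SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, n_features=60, n_informative=8,
                           random_state=1)
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('fdr', SelectFdr(alpha=0.01)),
    ('pct', SelectPercentile(percentile=70)),
])
X_sel = pipe.fit_transform(X, y)
print(X_sel.shape)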
# module-level imports needed by this method
import codecs
import math

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectFdr, chi2


def extract_features_fdr(self, file_name, N=-1, alpha=5e-2, direction=True,
                         allow_subseq=True, binarization=True):
    '''
    Feature extraction with FDR correction
    :param file_name: output file for the Chi2 results
    :param N: top-N significant features (-1 keeps all)
    :param alpha: upper bound on the expected false discovery rate
    :param direction: if true, the score carries a sign
    :param allow_subseq: whether to keep features that are subsequences of
        already extracted features
    :param binarization: if the data is not binary ==> 'binary', 'median'
    :return: list of [feature, score, p-value, mean_pos, mean_neg] rows
    '''
    # https://brainder.org/2011/09/05/fdr-corrected-fdr-adjusted-p-values/
    # Filter: select the p-values for an estimated false discovery rate.
    # This uses the Benjamini-Hochberg procedure.
    selector = SelectFdr(chi2, alpha=alpha)
    if binarization == 'median':
        # binarize against the per-feature median
        median_vec = np.median(self.X.toarray(), axis=0)
        X = np.zeros(self.X.shape)
        X[np.where(self.X.toarray() > median_vec)] = 1
        X = csr_matrix(X)
    elif binarization:
        # binarize on presence/absence
        X = self.X.toarray()
        X[np.where(X > 0)] = 1
        X = csr_matrix(X)
    else:
        X = self.X
    selector.fit_transform(X, self.Y)
    # scores and p-values are available for every feature, selected or not
    scores = {
        self.feature_names[i]: (s, selector.pvalues_[i])
        for i, s in enumerate(list(selector.scores_))
        if not math.isnan(s)
    }
    # sort by chi2 score, descending; keep the top N if requested
    scores = sorted(scores.items(), key=lambda kv: kv[1][0], reverse=True)
    if N != -1:
        scores = scores[0:N]
    f = codecs.open(file_name, 'w')
    c_1 = np.sum(self.Y)
    c_0 = np.sum([1 for x in self.Y if x == 0])
    f.write('\t'.join(['feature', 'score', 'p-value', 'mean+-pos',
                       'mean+-neg']) + '\n')
    X = X.toarray()
    pos_scores = []
    extracted_features = []
    for w, score in scores:
        feature_array = X[:, self.feature_names.index(w)]
        pos = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 1]
        neg = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 0]
        m_pos = np.mean(pos)
        s_pos = np.std(pos)
        m_neg = np.mean(neg)
        s_neg = np.std(neg)
        # 2x2 contingency counts: c11/c01 in the positive class,
        # c10/c00 in the negative class
        c11 = np.sum(pos)
        c01 = c_1 - c11
        c10 = np.sum(neg)
        c00 = c_0 - c10
        s = score[0]
        if direction and c11 > ((1.0 * c11) * c00 - (c10 * 1.0) * c01):
            s = -s
        s = np.round(s, 2)
        if allow_subseq:
            pos_scores.append([str(w), s, score[1], m_pos, m_neg])
            #if m_pos > m_neg:
            f.write('\t'.join(
                [str(w), str(s), str(score[1])] +
                [str(x) for x in [m_pos, s_pos, m_neg, s_neg]]) + '\n')
        else:
            # skip any feature that is a subsequence of an already
            # extracted feature
            flag = False
            for feature in extracted_features:
                if w in feature:
                    flag = True
            if not flag:
                pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                f.write('\t'.join(
                    [str(w), str(s), str(score[1])] +
                    [str(x) for x in [m_pos, s_pos, m_neg, s_neg]]) + '\n')
    f.close()
    return pos_scores