Example #1
def select_fdr(input_data,
               feature_names=None,
               score_func=f_classif,
               alpha=0.05):
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            msg = 'Feature selection was too aggressive, '
            msg += 'increasing alpha from {} to {}'.format(
                alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}

    return output_data, feature_names, final_args
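
A minimal, self-contained sketch of the adaptive-alpha idea above, using only scikit-learn on synthetic data (the module helpers remove_constant, flatten, and make3D are omitted here):

import logging
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFdr, f_classif

X, y = make_classification(n_samples=200, n_features=50, random_state=0)
alpha = 0.05
while True:
    # SelectFdr keeps features whose p-values pass the Benjamini-Hochberg test
    selector = SelectFdr(score_func=f_classif, alpha=alpha)
    X_sel = selector.fit_transform(X, y)
    if X_sel.shape[1] > 1:
        break
    logging.warning('Too few features kept; raising alpha to %s', alpha + 0.05)
    alpha += 0.05
print('Selected %d features at alpha=%.2f' % (X_sel.shape[1], alpha))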
Example #2
def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the K best features (filtered by FDR, then ranked by the ANOVA F-test;
    more advanced options can be implemented).
    Saves the data matrix with the kept features to a new output file, "REDUCED_Feat.csv".
    '''
    FD = SelectFdr(alpha=alpha)
    X = FD.fit_transform(X, y)
    # keep the feature names aligned with the FDR-reduced matrix
    featureNames = featureNames[FD.get_support()]

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print("K_featnames: %s" % (K_featnames))
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[K_featnames]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df
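
A hypothetical call, assuming 'trainingSetFeatures.csv' has an index column followed by feature columns, and using placeholder labels:

import numpy as np
import pandas as pd

df = pd.read_csv('trainingSetFeatures.csv', index_col=0)  # hypothetical file
X, featureNames = df.values, np.array(df.columns)
y = np.random.randint(0, 2, size=len(df))  # placeholder labels for illustration
reduced = featureFitting('trainingSetFeatures.csv', X, y, featureNames, optimalFlag=None)
print(reduced.shape)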
Example #3
import pandas as pd

#importing the dataset
dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

#feature selector 1
from sklearn.feature_selection import SelectKBest
fs1 = SelectKBest(k=5)
x_new1 = fs1.fit_transform(x, y)

#feature selector 2
from sklearn.feature_selection import SelectFdr
fs2 = SelectFdr()
x_new2 = fs2.fit_transform(x, y)

#feature selector 3
from sklearn.linear_model import LinearRegression
estimator = LinearRegression()
from sklearn.feature_selection import RFE
fs3 = RFE(estimator, n_features_to_select=5)
x_new3 = fs3.fit_transform(x, y)

#feature selector 4
from sklearn.feature_selection import SelectFromModel
fs4 = SelectFromModel(estimator)
x_new4 = fs4.fit_transform(x, y)

#feature selector 5
from sklearn.feature_selection import SelectFwe
fs5 = SelectFwe()
x_new5 = fs5.fit_transform(x, y)
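
A quick comparison of the five selectors above, continuing the same script:

for i, x_new in enumerate([x_new1, x_new2, x_new3, x_new4, x_new5], start=1):
    print('selector %d kept %d features' % (i, x_new.shape[1]))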
Example #4
    t_2 = 0.05

    vt_1 = VarianceThreshold(threshold=t_1)
    vt_2 = VarianceThreshold(threshold=t_2)
    X_1 = vt_1.fit_transform(X)
    X_2 = vt_2.fit_transform(X)

    print(f'{name}: Variance threshold={t_1}, Number of features: {X_1.shape[1]}')
    print(f'{name}: Variance threshold={t_2}, Number of features: {X_2.shape[1]}')

    # =========================
    # Univariate selection with stat test
    # =========================

    fdr = SelectFdr()
    X_fdr = fdr.fit_transform(X, Y)

    print(f'{name}: FDR, Number of features: {X_fdr.shape[1]}')

    # =========================
    # L1 Based
    # =========================

    # Linear Lasso
    # Note: the assignment specifies a linear regression algorithm, not logistic regression!
    alphas = np.linspace(0.01, 1, 1000)
    scores = []
    features_number = []
    for alpha in alphas:
        clf = linear_model.LogisticRegression(penalty='l1', C=alpha, solver='liblinear')
        clf.fit(X, Y)
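
The loop above is truncated; a plausible completion (a sketch, not the original code) would record each model's training score and its number of non-zero coefficients:

        scores.append(clf.score(X, Y))
        features_number.append(np.sum(clf.coef_ != 0))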
Example #5
y = np.zeros(Y.shape)
class_names = list(np.unique(Y))
class_num = 0
number_of_classes = np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y == classes] = int(class_num)
    print('Class ' + classes + ': ' + str(class_num))
    class_num = class_num + 1

X = StandardScaler().fit_transform(X)  #### for ANOVA
#X = MinMaxScaler().fit_transform(X)  #### for Chi2

## Select features
fdr = SelectFdr(f_classif, alpha=0.005)  #### for ANOVA
#fdr = SelectFdr(chi2, alpha=0.05)  #### for Chi2
X_sel = fdr.fit_transform(X, y)
idx_sorted = fdr.get_support(indices=True)
fdr_select_features = list(feature_set[i] for i in idx_sorted)
print('Selected features with FDR: ')
print(fdr_select_features)
print('\n')
print(X.shape)
print(X_sel.shape)

X_new = df[fdr_select_features].values
Y=(df['Class'])

le = preprocessing.LabelEncoder()
y=le.fit_transform(Y)
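
The LabelEncoder call at the end yields the same integer codes as the manual loop above, since both follow the sorted order of np.unique; a minimal sketch:

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = le.fit_transform(Y)  # same codes as the manual np.unique loop
print(dict(zip(le.classes_, range(len(le.classes_)))))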

Example #6
    def extract_features_fdr(self,
                             file_name,
                             N=-1,
                             alpha=5e-2,
                             direction=False,
                             allow_subseq=True,
                             binarization=True,
                             remove_redundant_markers=True):
        '''
            Feature extraction with fdr-correction
        '''
        # https://brainder.org/2011/09/05/fdr-corrected-fdr-adjusted-p-values/
        # Filter: Select the p-values for an estimated false discovery rate
        # This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate.
        selector = SelectFdr(chi2, alpha=alpha)


        if binarization == 'median':
            median_vec = np.median(self.X.toarray(), axis=0)
            X = np.zeros(self.X.shape)
            X[np.where(self.X.toarray() > median_vec)] = 1
            X = csr_matrix(X)
        elif binarization:
            X = self.X.toarray()
            X[np.where(X > 0)] = 1
            X = csr_matrix(X)
        else:
            X = self.X

        #if remove_redundant_markers:
        #    dist=get_kl_rows(X.T)
        #    dist=dist+dist.T


        selector.fit_transform(X, self.Y)
        scores = {self.feature_names[i]: (s, selector.pvalues_[i]) for i, s in enumerate(list(selector.scores_)) if
                  not math.isnan(s)}


        f = codecs.open(file_name, 'w')
        c_1 = np.sum(self.Y)
        c_0 = np.sum([1 for x in self.Y if x == 0])
        f.write('\t'.join(['Motif', 'Chi2-score', 'p-value']) + '\n')
        X = X.toarray()
        pos_scores = []

        new_scores = {}
        for w, score in scores.items():
            if score[1] < 0.05:
                feature_array = X[:, self.feature_names.index(w)]
                pos = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 1]
                neg = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 0]
                m_pos = np.mean(pos)
                s_pos = np.std(pos)
                m_neg = np.mean(neg)
                s_neg = np.std(neg)

                c11 = np.sum(pos)
                c01 = c_1 - c11
                c10 = np.sum(neg)
                c00 = c_0 - c10
                s = score[0]
                if direction and c11 > ((1.0 * c11) * c00 - (c10 * 1.0) * c01):
                    s = -s
                if s > 0 and len(w) < 25:
                    new_scores[w] = score

        # sort features by descending chi2 score
        if N == -1:
            scores = sorted(new_scores.items(), key=lambda item: item[1][0], reverse=True)
        else:
            scores = sorted(new_scores.items(), key=lambda item: item[1][0], reverse=True)[0:N]

        extracted_features=[]

        for w, score in scores:
            if score[1] < 0.05:
                feature_array = X[:, self.feature_names.index(w)]
                pos = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 1]
                neg = [feature_array[idx] for idx, x in enumerate(self.Y) if x == 0]
                m_pos = np.mean(pos)
                s_pos = np.std(pos)
                m_neg = np.mean(neg)
                s_neg = np.std(neg)

                c11 = np.sum(pos)
                c01 = c_1 - c11
                c10 = np.sum(neg)
                c00 = c_0 - c10
                s = score[0]
                if direction and c11 > ((1.0 * c11) * c00 - (c10 * 1.0) * c01):
                    s = -s
                s = np.round(s, 2)

                if allow_subseq:
                    pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                    #if m_pos > m_neg:
                    f.write('\t'.join([str(w), str(s), str(score[1])]) + '\n')
                else:
                    flag = False
                    for feature in extracted_features:
                        if w in feature:
                            flag = True
                    if not flag:
                        pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                        f.write('\t'.join([str(w), str(s), str(score[1])]) + '\n')
                        # remember kept features so later subsequences are skipped
                        extracted_features.append(str(w))

        f.close()
        return pos_scores
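
Hypothetical usage, assuming an object named model of this class with sparse X, labels Y, and feature_names populated:

top = model.extract_features_fdr('chi2_features.tsv', N=50, alpha=0.05)
for name, score, pval, mean_pos, mean_neg in top[:5]:
    print(name, score, pval)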
Example #7
y = np.zeros(Y.shape)
class_names = list(np.unique(Y))
class_num = 0
number_of_classes = np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y == classes] = int(class_num)
    print('Class ' + classes + ': ' + str(class_num))
    class_num = class_num + 1

X = StandardScaler().fit_transform(X)  #### for ANOVA
#X = MinMaxScaler().fit_transform(X)  #### for Chi2

## Select features
fdr = SelectFdr(f_classif, alpha=0.005)  #### for ANOVA
#fdr = SelectFdr(chi2, alpha=0.05)  #### for Chi2
X_select = fdr.fit_transform(X, y)

#####to select top 100 features############
X_select = SelectKBest(f_classif, k=100).fit_transform(X_select, y)
#X_select = SelectKBest(chi2,k=100).fit_transform(X_select,y)

idx_sorted = fdr.get_support(indices=True)
pvals = fdr.pvalues_
pscores = fdr.scores_

print(X.shape)
print(X_select.shape)

select_features = list(feature_set[i] for i in idx_sorted)
print('Features kept by the FDR step: ')
print(select_features)
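
Note that idx_sorted reflects only the FDR step, not the later top-100 cut. To name the final 100 features, compose the two masks (a sketch; feature_set is assumed to hold the original column names):

kbest = SelectKBest(f_classif, k=100).fit(X[:, idx_sorted], y)
final_idx = idx_sorted[kbest.get_support(indices=True)]
top100_features = [feature_set[i] for i in final_idx]
print(top100_features)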
Example #8
    fileName = r'\trainingSetFeatures.csv'

    # filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap'
    filePath = str(input('Input directory containing TrainingData csv: '))

    ## features, labels, lb_encoder,featureNames = load_data(filename, 'file')
    features, labels, lb_encoder,featureNames = load_data(filePath+fileName, 'file')

    X, y = features, labels
    print('len(set(y))', len(set(y)))
    print(X.shape,"X = samples, features")
    scale = StandardScaler(copy=False)
    X = scale.fit_transform(X)

    FD = SelectFdr(alpha=0.0005)
    FD_K = SelectPercentile(percentile=70)
    X = FD.fit_transform(X,y)
    print(X.shape,"X post FDR alpha filter")
    X_FD = FD_K.fit_transform(X,y)
    print(X_FD.shape, "X post FDR + percentile filter")

    print("\n BASE X models: \n")
    ModelParam_GridSearch(X,y,cv=Kcv)
    '''
    pca = PCA(n_components='mle')
    X_PCA = pca.fit_transform(X)
    print(X_PCA.shape,"X - PCA,mle")
    ModelParam_GridSearch(X_PCA,y,cv=Kcv)
    '''

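
The scaling plus two-stage filter above can also be expressed as a single Pipeline (a sketch, assuming X and y as loaded above):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFdr, SelectPercentile

filter_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('fdr', SelectFdr(alpha=0.0005)),
    ('pct', SelectPercentile(percentile=70)),
])
X_FD = filter_pipe.fit_transform(X, y)
print(X_FD.shape, "X post scaler+FDR+percentile pipeline")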
Example #9
    def extract_features_fdr(self,
                             file_name,
                             N=-1,
                             alpha=5e-2,
                             direction=True,
                             allow_subseq=True,
                             binarization=True):
        '''
        Feature extraction with FDR correction.

        :param file_name: output file for the Chi2 report
        :param N: Top-N significant features (-1 keeps all)
        :param alpha: upper bound on the expected false discovery rate
        :param direction: if true, the score carries a sign
        :param allow_subseq: if false, skip features contained in already-extracted features
        :param binarization: if the data is not binary ==> 'binary', 'median'
        :return: list of [feature, score, p-value, mean-pos, mean-neg] rows
        '''
        # https://brainder.org/2011/09/05/fdr-corrected-fdr-adjusted-p-values/
        # Filter: Select the p-values for an estimated false discovery rate
        # This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate.
        selector = SelectFdr(chi2, alpha=alpha)

        if binarization == 'median':
            median_vec = np.median(self.X.toarray(), axis=0)
            X = np.zeros(self.X.shape)
            X[np.where(
                self.X.toarray() > np.median(self.X.toarray(), axis=0))] = 1
            X = csr_matrix(X)
        elif binarization:
            X = self.X.toarray()
            X[np.where(X > 0)] = 1
            X = csr_matrix(X)
        else:
            X = self.X

        selector.fit_transform(X, self.Y)
        scores = {
            self.feature_names[i]: (s, selector.pvalues_[i])
            for i, s in enumerate(list(selector.scores_)) if not math.isnan(s)
        }
        # sort features by descending chi2 score
        if N == -1:
            scores = sorted(scores.items(),
                            key=lambda item: item[1][0],
                            reverse=True)
        else:
            scores = sorted(scores.items(),
                            key=lambda item: item[1][0],
                            reverse=True)[0:N]

        f = codecs.open(file_name, 'w')
        c_1 = np.sum(self.Y)
        c_0 = np.sum([1 for x in self.Y if x == 0])
        f.write('\t'.join([
            'feature', 'score', 'p-value', 'mean-pos', 'std-pos', 'mean-neg',
            'std-neg'
        ]) + '\n')
        X = X.toarray()
        pos_scores = []

        extracted_features = []
        for w, score in scores:
            feature_array = X[:, self.feature_names.index(w)]
            pos = [
                feature_array[idx] for idx, x in enumerate(self.Y) if x == 1
            ]
            neg = [
                feature_array[idx] for idx, x in enumerate(self.Y) if x == 0
            ]
            m_pos = np.mean(pos)
            s_pos = np.std(pos)
            m_neg = np.mean(neg)
            s_neg = np.std(neg)

            c11 = np.sum(pos)
            c01 = c_1 - c11
            c10 = np.sum(neg)
            c00 = c_0 - c10
            s = score[0]
            if direction and c11 > ((1.0 * c11) * c00 - (c10 * 1.0) * c01):
                s = -s
            s = np.round(s, 2)

            if allow_subseq:
                pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                #if m_pos> m_neg:
                f.write(
                    '\t'.join([str(w), str(s), str(score[1])] +
                              [str(x)
                               for x in [m_pos, s_pos, m_neg, s_neg]]) + '\n')
            else:
                flag = False
                for feature in extracted_features:
                    if w in feature:
                        flag = True
                if not flag:
                    pos_scores.append([str(w), s, score[1], m_pos, m_neg])
                    f.write('\t'.join(
                        [str(w), str(s), str(score[1])] +
                        [str(x) for x in [m_pos, s_pos, m_neg, s_neg]]) + '\n')
                    # remember kept features so later subsequences are skipped
                    extracted_features.append(str(w))

        f.close()
        return pos_scores
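
A hypothetical follow-up: load the report written above into pandas for inspection (assumes the file name passed in was 'chi2_features.tsv'):

import pandas as pd

report = pd.read_csv('chi2_features.tsv', sep='\t')
print(report.sort_values('score', ascending=False).head())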