def rank_features_using_chisquare(cls,
                                      data_frame,
                                      target_key,
                                      cols_to_ignore=None):
        keys = list(data_frame.keys())
        target_col_idx = keys.index(target_key)

        # Removing the target column from keys
        del keys[target_col_idx]

        # Remove all columns that are asked to be ignored
        if cols_to_ignore is not None:
            for col in cols_to_ignore:
                idx = keys.index(col)
                del keys[idx]

        Y = data_frame.loc[:, target_key].values
        X = data_frame.loc[:, keys]
        # chi-square requires non-negative input, so flag any columns
        # that contain a negative value
        neg_test_result = np.any(X < 0, axis=0)
        non_negative_value_columns = [
            keys[i] for i, res in enumerate(neg_test_result) if not res
        ]

        # Keep data for only the non-negative valued columns
        X = data_frame.loc[:, non_negative_value_columns]

        score = chi_square.chi_square(X, Y)
        rank = chi_square.feature_ranking(score)
        ranked_features = [non_negative_value_columns[i] for i in rank]
        return score, ranked_features, non_negative_value_columns
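
A minimal usage sketch, assuming the method above is registered as a @classmethod on a hypothetical host class FeatureRanker (the class name and the toy DataFrame are not from the original):

import pandas as pd

df = pd.DataFrame({
    'f1': [1, 2, 3, 4],
    'neg': [-1, 2, 3, 4],     # dropped automatically: chi-square needs non-negative values
    'skip': [9, 9, 9, 9],
    'label': [0, 1, 0, 1],
})
score, ranked, usable_cols = FeatureRanker.rank_features_using_chisquare(
    df, target_key='label', cols_to_ignore=['skip'])
print(ranked)   # feature names, best chi-square score first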
Example #2
    def apply_impl(self, data):
        X, y = data.Xy
        # TODO: verify whether this can be implemented with numpy alone
        y = pd.Categorical(y).codes

        # chi_square requires X to be non-negative; negative values can
        # appear when an upstream scaler produces them.
        self._score = chi_square.chi_square(X, y)
        self._rank = chi_square.feature_ranking(self._score)
        self._nro_features = math.ceil(self.ratio * X.shape[1])

        return self.use_impl(data)
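
Since chi_square rejects negative inputs, one defensive option is to rescale features into [0, 1] before scoring. A minimal sketch, not part of the original class; MinMaxScaler is just one possible choice:

from sklearn.preprocessing import MinMaxScaler
from skfeature.function.statistical_based import chi_square

def chi_square_scores_nonneg(X, y):
    # MinMaxScaler maps every feature into [0, 1], which satisfies
    # chi_square's non-negativity requirement
    X01 = MinMaxScaler().fit_transform(X)
    return chi_square.chi_square(X01, y)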
Example #3
import time

import scipy.io
from skfeature.function.similarity_based import fisher_score, reliefF
from skfeature.function.statistical_based import chi_square
from skfeature.function.information_theoretical_based import JMI, MRMR, MIM
# svmrfe, hdmrlearn and hdmrselect are assumed to be local helpers


def run_fold(trial, P, X, y, method, dataset, parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:,trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher': 
        score = fisher_score.fisher_score(trnX,trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX,trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX,trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX,trnY,  n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX,trnY,n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX,trnY,n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX,trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':3,'M':1000,'b':'L'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':255,'M':1000,'b':'H'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    else:
        print(method + ' does not exist')

    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
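
For context, a hypothetical driver (names assumed, not from the original source): P is an (n_samples, n_trials) partition matrix whose column t equals 1 on the training rows of fold t, so all folds of one method can be run as:

results = [run_fold(t, P, X, y, 'chi2', 'BASEHOCK', 'train')
           for t in range(P.shape[1])]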
Example #4
    def test_chi_squared(self):
        X, y = self.DATA

        f = FilterChiSquare(ratio=0.5)
        f.fit(X, y)
        X_, y_ = f.transform(X, y)

        score = chi_square.chi_square(X, y)
        rank = chi_square.feature_ranking(score)
        selected = rank[0:5]

        assert f.fit(X, y) is f
        assert np.array_equal(f.rank(), rank)
        assert np.allclose(f.score(), score)
        assert np.array_equal(f.selected(), selected)
        assert np.allclose(X_, X[:, selected])
        assert np.array_equal(y_, y)
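
self.DATA is not shown in the original; for rank[0:5] to agree with ratio=0.5, the fixture must have ten features, since ceil(0.5 * 10) = 5. A sketch of such a fixture:

import numpy as np

rng = np.random.default_rng(0)
DATA = (rng.random((100, 10)),           # X: 100 samples, 10 non-negative features
        rng.integers(0, 2, size=100))    # y: binary labels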
Example #5
import scipy.io
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from skfeature.function.statistical_based import chi_square


def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features
    print(X.shape)
    # split data into 10 folds
    ss = KFold(n_splits=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss.split(X):
        # obtain the chi-square score of each feature
        score = chi_square.chi_square(X, y)

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
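
Note that the score above is computed from the full X on every fold, so it never changes across iterations and the test rows influence the ranking. A leakage-free variant (a sketch using the same names) scores the training split only:

        # inside the loop, replacing the two scoring lines above:
        score = chi_square.chi_square(X[train], y[train])
        idx = chi_square.feature_ranking(score)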
Example #7
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# RFS and feature_ranking are assumed to come from skfeature
# (sparse_learning_based and utility.sparse_learning, respectively), and
# 'dataset' is assumed to be the breast-cancer dataframe loaded upstream.

X = dataset.iloc[:, 2:32]  # [all rows, cols from index 2 to the last one, excluding 'Unnamed: 32']
y = dataset.iloc[:, 1]     # [all rows, col 1 only, which contains the classes of cancer]
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
X_train = X_train.values
X_test = X_test.values

# compute RFS scores
score = RFS(X_train, y_train)
idx = feature_ranking(score)
np.save('features/rfs.npy', idx)
print('Features saved')
#idx = np.load('features/rfs.npy')

# keep references to the original arrays (plain assignment aliases them
# rather than copying; use .copy() if they will be mutated)
X_train_copy = X_train
y_train_copy = y_train
X_test_copy = X_test
y_test_copy = y_test

# train and compute accuracy of final model trained on selected features
final_list = []
for num_fea in range(30, 0, -1):
    # restore the original training data
    X_train = X_train_copy
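
The snippet is truncated here. Judging from the surrounding code (and the pattern in Example #8), the loop presumably continues roughly as follows; this is a sketch, with clf and accuracy_score assumed from the cut-off context:

    # hypothetical continuation, not the original code
    selected_fea_train = X_train[:, idx[0:num_fea]]
    selected_fea_test = X_test[:, idx[0:num_fea]]
    clf.fit(selected_fea_train, y_train)
    final_list.append(accuracy_score(y_test, clf.predict(selected_fea_test)))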
Example #8
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # reliefF
        score = reliefF.reliefF(X_train, y_train)
        idx = reliefF.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # chi_square (np.abs enforces the non-negative input that chi-square requires)
        score = chi_square.chi_square(np.abs(X_train), y_train)
        idx = chi_square.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # pca
        pca = PCA(n_components=num_features)
        pca.fit(X_train)
        selected_fea_train = pca.transform(X_train)
        selected_fea_test = pca.transform(X_test)
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # rfe
        estimator = LinearSVC(random_state=random_state)
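
The snippet cuts off at the RFE step; the standard scikit-learn pattern it appears to be heading toward is (a sketch, matching the structure of the blocks above):

        # hypothetical continuation using sklearn's recursive feature elimination
        from sklearn.feature_selection import RFE
        rfe = RFE(estimator, n_features_to_select=num_features)
        rfe.fit(X_train, y_train)
        selected_fea_train = rfe.transform(X_train)
        selected_fea_test = rfe.transform(X_test)
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))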
Example #9
from skfeature.function.statistical_based import chi_square


def chi_square_FS(X, y):
    score = chi_square.chi_square(X, y)
    idx = chi_square.feature_ranking(score)
    return (idx, score)
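
Example usage (a sketch; the input is made non-negative with np.abs because chi-square requires it):

import numpy as np

X = np.abs(np.random.randn(100, 20))
y = np.random.randint(0, 2, 100)
idx, score = chi_square_FS(X, y)
print(idx[:5])   # indices of the five highest-scoring features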