def rank_features_using_chisquare(cls,
                                      data_frame,
                                      target_key,
                                      cols_to_ignore=None):
        keys = list(data_frame.keys())
        target_col_idx = keys.index(target_key)

        # Removing the target column from keys
        del keys[target_col_idx]

        # Remove all columns that are asked to be ignored
        if cols_to_ignore is not None:
            for col in cols_to_ignore:
                idx = keys.index(col)
                del keys[idx]

        Y = data_frame.loc[:, target_key].values
        X = data_frame.loc[:, keys]
        neg_test_result = np.any(X < 0, axis=0)
        non_negative_value_columns = [
            keys[i] for i, res in enumerate(neg_test_result) if not res
        ]

        # Get data for the non-negative-valued columns only
        X = data_frame.loc[:, non_negative_value_columns]

        score = chi_square.chi_square(X, Y)
        rank = chi_square.feature_ranking(score)
        ranked_features = [non_negative_value_columns[i] for i in rank]
        return score, ranked_features, non_negative_value_columns
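A minimal usage sketch for the method above, assuming it is a classmethod on a hypothetical helper class FeatureRanker and that chi_square comes from skfeature:

import pandas as pd

# Hypothetical toy frame: "balance" has negative values, so the method drops
# it before scoring; "churn" is the target column.
df = pd.DataFrame({
    "age": [23, 45, 31, 52],
    "visits": [4, 9, 6, 11],
    "balance": [-12.0, 3.0, 7.5, -1.0],
    "churn": [0, 1, 0, 1],
})

score, ranked, kept = FeatureRanker.rank_features_using_chisquare(
    df, target_key="churn")
print(kept)    # ['age', 'visits']  (non-negative columns only)
print(ranked)  # the same columns, ordered by descending chi-square score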
Example #2
    def apply_impl(self, data):
        X, y = data.Xy
        # TODO: verify whether this can be implemented with NumPy alone
        y = pd.Categorical(y).codes

        self._score = chi_square.chi_square(X, y)
        # Note: chi_square requires a non-negative X; this breaks when an
        # upstream scaler generates negative values.

        self._rank = chi_square.feature_ranking(self._score)
        self._nro_features = math.ceil(self.ratio * X.shape[1])

        return self.use_impl(data)
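Because chi_square rejects negative input, a common workaround (a sketch, independent of the class above; skfeature's usual import path is assumed) is to rescale features into a non-negative range before scoring:

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from skfeature.function.statistical_based import chi_square

X = np.array([[-1.2, 0.4], [0.3, 1.1], [2.0, -0.5]])  # e.g. standardized data
y = np.array([0, 1, 1])

X_nonneg = MinMaxScaler().fit_transform(X)  # maps each column into [0, 1]
score = chi_square.chi_square(X_nonneg, y)
rank = chi_square.feature_ranking(score)    # feature indices, best first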
Example #3
def run_fold(trial, P, X, y, method, dataset, parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set'].astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 3, 'M': 1000, 'b': 'L'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set'].astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 255, 'M': 1000, 'b': 'H'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    else:
        raise ValueError(method + ' does not exist')

    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
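A usage sketch for run_fold, assuming P is an n_samples x n_trials partition matrix whose column t flags the training rows of trial t (hypothetical toy data):

import numpy as np

rng = np.random.RandomState(0)
X = np.abs(rng.randn(100, 20))            # non-negative, as chi2 requires
y = rng.randint(0, 2, size=100)
P = (rng.rand(100, 5) < 0.8).astype(int)  # 5 trials, ~80% training rows each

out = run_fold(trial=0, P=P, X=X, y=y,
               method='chi2', dataset='toy', parttype='random')
print(out['features'][:5], out['cputime'])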
Example #4
    def test_chi_squared(self):
        X, y = self.DATA

        f = FilterChiSquare(ratio=0.5)
        f.fit(X, y)
        X_, y_ = f.transform(X, y)

        score = chi_square.chi_square(X, y)
        rank = chi_square.feature_ranking(score)
        selected = rank[0:5]

        assert f.fit(X, y) is f
        assert np.array_equal(f.rank(), rank)
        assert np.allclose(f.score(), score)
        assert np.array_equal(f.selected(), selected)
        assert np.allclose(X_, X[:, selected])
        assert np.array_equal(y_, y)
Example #5
def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features
    print(X.shape)
    # split data into 10 folds (modern scikit-learn API)
    ss = model_selection.KFold(n_splits=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss.split(X):
        # obtain the chi-square score of each feature
        score = chi_square.chi_square(X, y)

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
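Note that the loop above scores features on the full X in every fold; a leakage-free variant (a sketch under the same setup) scores the training rows only:

    for train, test in ss.split(X):
        # score and rank on the training split only
        score = chi_square.chi_square(X[train], y[train])
        idx = chi_square.feature_ranking(score)
        clf.fit(X[train][:, idx[0:num_fea]], y[train])
        correct += accuracy_score(y[test], clf.predict(X[test][:, idx[0:num_fea]]))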
Example #6
def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds (modern scikit-learn API)
    ss = model_selection.KFold(n_splits=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss.split(X):
        # obtain the chi-square score of each feature
        score = chi_square.chi_square(X, y)

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
Example #7
        # fisher_score
        score = fisher_score.fisher_score(X_train, y_train)
        idx = fisher_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # reliefF
        score = reliefF.reliefF(X_train, y_train)
        idx = reliefF.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # chi_square
        score = chi_square.chi_square(np.abs(X_train), y_train)
        idx = chi_square.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # pca
        pca = PCA(n_components=num_features)
        pca.fit(X_train)
        selected_fea_train = pca.transform(X_train)
        selected_fea_test = pca.transform(X_test)
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # rfe
Example #8
def chi_square_FS(X, y):
    score = chi_square.chi_square(X, y)
    idx = chi_square.feature_ranking(score)
    return (idx, score)
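A quick call of the wrapper above, on hypothetical non-negative count data:

import numpy as np

X = np.array([[1, 0, 3],
              [2, 1, 0],
              [0, 2, 1],
              [3, 0, 2]], dtype=float)  # non-negative, as chi_square requires
y = np.array([0, 1, 0, 1])

idx, score = chi_square_FS(X, y)
print(idx)  # feature indices ordered by descending chi-square score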
Example #9
def predict(fs_algorithm=None,
            dataframe=None,
            dataset=None,
            C=1.0,
            epsilon=0.1):
    sc = MinMaxScaler(feature_range=(0, 10))
    best_sort_feature = []

    if dataframe is None and dataset is None:
        return None

    if dataset:
        dataset_data = [model_to_dict(data) for data in dataset]

        df = pd.DataFrame(dataset_data)

        city_id = np.asarray(df['city'])
        raw_X = np.asarray(
            df.loc[:, 'sum_price_car':'std_buyer_land_rent'])  # features
        raw_y = np.asarray(df['BPS_poverty_rate'])  # label

    if dataframe:
        df = pd.read_excel(dataframe)

        city_id = np.asarray(df['city_id'])
        raw_X = np.asarray(
            df.loc[:, 'sum_price_car':'std_buyer_land_rent'])  # features
        raw_y = np.asarray(df['BPS_poverty_rate'])  # label

    # 2. pre-processing
    clean_X = np.nan_to_num(raw_X)
    clean_y = np.nan_to_num(raw_y)

    # 3. normalization (fit the scaler on the NaN-cleaned data)
    sc.fit(clean_X)
    X = np.array(sc.transform(clean_X))
    y = np.array(clean_y)

    if fs_algorithm == "f_score":
        ranked_index = f_score.f_score(X, y, mode="index")
    elif fs_algorithm == "chi_square":
        X_feature = X.astype(int)
        y_label = y.astype(int)
        ranked_index = chi_square.chi_square(X_feature, y_label, mode="index")
    elif fs_algorithm == "cfs":
        ranked_index = CFS.cfs(X, y)

    for row in X:
        row_array = []
        for num, feature_idx in enumerate(ranked_index):
            row_array.append(row[feature_idx])
        best_sort_feature.append(row_array)

    # 5. get best feature predict score
    best_pred, best_score, result, ten_column_predictions \
        = trainf(best_sort_feature, y, C, epsilon)

    now_unix_timestamp = str(datetime.utcnow().timestamp())
    time = now_unix_timestamp.split(".")[0]

    # set filename
    filename = "dumped_model/svr_" + fs_algorithm + "_" + time + "_.sav"
    # get full file path
    SITE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    full_model_file_path = SITE_ROOT + "/" + filename

    # get regressor
    regressor = best_score[3]
    with open(filename, 'wb') as fh:
        pickle.dump(regressor, fh)
    """
    RETURN VALUES
    return values of the SVR prediction
    """

    # modified best prediction return value
    best_pred = [best_pred, dict(zip(city_id, best_pred))]

    # append the list of ranked indices for the best-performing feature count
    best_score.append(ranked_index[:best_score[2]])

    y_true = y
    """
    1. best prediction => hasil prediksi poverty rate (dictionary, key => city_id)
    2. detail => detail best score (array)
        .best_score => r2
        .lowest_score => rmse
        .jumlah fitur dengan terbaik
        .model terbaik
        .urutan fitur terbaik
    3. result => detail hasil r2 dari 10 fitur hingga 96 fitur (array)
        -> [fitur, r2, rmse]
    4. hasil percobaan prediksi per 10 fitur (array)
    5. actual poverty rate
    6. filename
    """
    return best_pred, best_score, result, ten_column_predictions, y_true, full_model_file_path
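To reuse the persisted model later, one would unpickle it and call predict on identically pre-processed rows (the path below is hypothetical):

import pickle

with open("dumped_model/svr_chi_square_1700000000_.sav", "rb") as fh:
    regressor = pickle.load(fh)
# new_rows (hypothetical) must be scaled and column-ordered exactly like
# the best_sort_feature matrix used during training
predictions = regressor.predict(new_rows)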
# Randomly subsample the training data (10,000 examples) for the computations below

X = np.array(train_data)
y = np.array(train_label)

X_relief, y_relief = shuffle(X, y, n_samples=10000, random_state=0)
'''
Filter methods:
Distance: ReliefF
Dependence: Chi-squared
Information: MIFS (Mutual Information Feature Selection)
'''
# ReliefF and Chi-squared each produce one score per feature. MIFS is slightly
# different: the second row of its output can also be treated as a score.
# Normalize all three scores into [0, 1] and take their mean.
RelieF_score = reliefF.reliefF(X_relief, y_relief[:, 0], k=n_features)  # RelieF
Chi = chi_square.chi_square(X, y[:, 0])
# Return values: the first row is the ranked feature indices, the second row is
# the objective-function values, and the third row is the mutual information
# between each feature and the response variable.
Mifs = MIFS.mifs(X_relief, y_relief[:, 0], n_selected_features=n_features)

'''
Fuse the selections using the mean method
'''
scores = pd.DataFrame({'Feature': list(Mifs[0]), 'MIFS': list(Mifs[1])})
scores = scores.sort_values(by=['Feature'])
scores['Relief'] = RelieF_score
scores['Chi'] = Chi
# Normalize each score column to [0, 1]
min_max_scaler = preprocessing.MinMaxScaler()
scores['MIFS_scaler'] = min_max_scaler.fit_transform(scores.loc[:, ['MIFS']])
scores['Relief_scaler'] = min_max_scaler.fit_transform(scores.loc[:, ['Relief']])
scores['Chi_scaler'] = min_max_scaler.fit_transform(scores.loc[:, ['Chi']])
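The snippet stops after scaling; a minimal continuation of the stated mean method averages the three normalized scores and ranks features by the fused value:

# fuse: average the three normalized scores, then rank in descending order
scores['Mean'] = scores[['MIFS_scaler', 'Relief_scaler', 'Chi_scaler']].mean(axis=1)
fused = scores.sort_values(by='Mean', ascending=False)
print(fused['Feature'].head(10).tolist())  # top-10 features under the fused score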