import scipy.io

from construct_W import construct_W
from skfeature.function.sparse_learning_based import MCFS
import unsupervised_evaluation  # local helper returning (ari, nmi, acc)


def main():
    # load matlab data
    print 'Loading Data !'
    mat = scipy.io.loadmat('../data/COIL20.mat')
    print 'Data Loaded !'
    X = mat['X']
    X = X.astype(float)
    y = mat['Y']
    y = y[:, 0]

    # construct affinity matrix W (keys must use construct_W's underscore
    # naming, e.g. "neighbor_mode", not "neighborMode"; unknown keys are
    # silently replaced by defaults)
    kwargs = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 0.1
    }
    W = construct_W(X, **kwargs)

    # mcfs feature selection
    n_selected_features = 100
    print 'Training Model !'
    S = MCFS.mcfs(X, n_selected_features, W=W, n_clusters=20)
    print 'Model Trained !'
    idx = MCFS.feature_ranking(S)

    # evaluation: cluster on the selected features and score the clustering
    X_selected = X[:, idx[0:n_selected_features]]
    ari, nmi, acc = unsupervised_evaluation.evaluation(
        X_selected=X_selected, n_clusters=20, y=y)
    # print 'ARI:', ari
    # print 'NMI:', nmi
    print 'Accuracy:', round(acc * 100.0, 2), '%'


if __name__ == '__main__':
    main()
import numpy as np

import construct_W


def reliefF(X, y):
    """
    This function implements reliefF feature selection. The steps are as follows:
    1. Construct the affinity matrix W in the reliefF way
    2. For the r-th feature, define fr = X(:,r); the reliefF score of the
       r-th feature is -1 + fr'*W*fr

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score for each feature

    Reference
    ---------
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """
    # construct the affinity matrix W
    kwargs = {"neighbor_mode": "supervised", "reliefF": True, 'y': y}
    W = construct_W.construct_W(X, **kwargs)

    # score of the i-th feature: -1 + f_i' * W * f_i
    n_samples, n_features = X.shape
    score = np.zeros(n_features)
    for i in range(n_features):
        score[i] = -1 + np.dot(np.transpose(X[:, i]), W.dot(X[:, i]))
    return score
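# Minimal usage sketch (not part of the original file): reliefF scores are
# "larger is better", so rank features by descending score. The data path
# '../data/COIL20.mat' mirrors the other scripts in this repo and is an
# assumption.
if __name__ == '__main__':
    import scipy.io

    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)
    y = mat['Y'][:, 0]
    score = reliefF(X, y)
    idx = np.argsort(score)[::-1]  # descending: best features first
    print 'Top 10 features:', idx[0:10]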
import scipy.io
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import construct_W as CW
import fisher_score as FS


def main():
    # load matlab data
    print '-----------------------------------------'
    print 'Loading \'COIL20\' Data !'
    mat = scipy.io.loadmat('../data/COIL20.mat')
    print 'Data Loaded !'
    print '-----------------------------------------'
    X = mat['X']  # data
    y = mat['Y']  # labels
    y = y[:, 0]
    X = X.astype(float)
    n_samples, n_features = X.shape

    # split data
    print 'Splitting data into 10 folds !'
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)
    print 'Data Split !'
    print '-----------------------------------------'

    # evaluation
    num_fea = 100
    print 'Initializing KNN !'
    neigh = KNeighborsClassifier(n_neighbors=10)
    print 'KNN Initialized !'
    print '-----------------------------------------'
    correct = 0
    fold_no = 0
    for train, test in ss:
        print '\tFold No.', fold_no
        kwargs = {
            "neighbor_mode": "supervised",
            "fisher_score": True,
            'y': y[train]
        }
        print 'Constructing Affinity Matrix !'
        # note: fisher_score() builds its own affinity matrix internally,
        # so W is constructed here only for illustration and not used below
        W = CW.construct_W(X[train], **kwargs)
        print 'Affinity Matrix Constructed !'

        print 'Calculating Fisher Score and ranking...'
        score = FS.fisher_score(X[train], y[train])
        idx = FS.feature_ranking(score)
        print 'Fisher Score and ranking calculated !'

        # train a KNN on the num_fea top-ranked features and test it
        selected_features = X[:, idx[0:num_fea]]
        neigh.fit(selected_features[train], y[train])
        y_predict = neigh.predict(selected_features[test])
        acc = accuracy_score(y[test], y_predict)
        print acc
        correct = correct + acc
        fold_no += 1
    print '-----------------------------------------'
    print '10-fold Cross-Validation Accuracy:', round(
        (float(correct) / 10) * 100.0, 2), '%'


if __name__ == '__main__':
    main()
import numpy as np
import scipy.linalg
from sklearn import linear_model

from construct_W import construct_W


def mcfs(X, n_selected_features, **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """
    # use the default affinity matrix if none is supplied
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the generalized eigen-decomposition problem and get the top K
    # eigenvectors with respect to the smallest eigenvalues
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1])

    # solve K L1-regularized regression problems using the LARs algorithm,
    # with the cardinality constraint set to n_selected_features
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_
    return W
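# Hedged sketch of the companion ranking step (not part of the original
# file): following Cai et al., the MCFS score of feature j is the largest
# absolute regression coefficient across the n_clusters columns of the
# weight matrix, and features are ranked by descending score. skfeature
# ships a feature_ranking() helper; the version below is a minimal
# re-implementation of that idea, not the library's exact code.
def feature_ranking(W):
    mcfs_score = np.absolute(W).max(1)  # per-feature MCFS score
    idx = np.argsort(mcfs_score)[::-1]  # descending: best features first
    return idx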
import numpy as np
from scipy.sparse import diags

import construct_W


def fisher_score(X, y):
    """
    Compute the Fisher score of each feature. The score is derived from the
    Laplacian score of an affinity matrix W built in the fisher_score way,
    via fisher_score = 1 / lap_score - 1.
    """
    # construct the affinity matrix W in a fisher-score way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W.construct_W(X, **kwargs)

    # build the diagonal D matrix from the affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute the numerator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # compute the denominator of Lr
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # avoid a zero denominator in Lr
    D_prime[D_prime < 1e-12] = 10000
    lap_score = 1 - np.array(np.multiply(L_prime, 1 / D_prime))[0, :]

    # compute the fisher score from the laplacian score,
    # where fisher_score = 1 / lap_score - 1
    score = 1.0 / lap_score - 1
    return np.transpose(score)
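# Hedged sketch of the feature_ranking() helper that the demo script above
# calls as FS.feature_ranking() (not part of the original file): a larger
# Fisher score means a more discriminative feature, so rank indices by
# descending score.
def feature_ranking(score):
    idx = np.argsort(score, 0)
    return idx[::-1]  # descending: best features first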
import numpy as np


def trace_ratio(X, y, n_selected_features, **kwargs):
    """
    Select n_selected_features features by maximizing the trace ratio
    criterion trace(S_between) / trace(S_within) over feature subsets.

    kwargs:
        style: {'fisher' or 'laplacian'}
            how to build the two affinity matrices (default is 'fisher')
        verbose: {boolean}
            whether to print the objective value at each iteration (default is False)

    Output
    ------
    feature_idx, feature_score, subset_score
    """
    import construct_W

    # if 'style' is not specified, use the fisher-score way to build the two affinity matrices
    if 'style' not in kwargs.keys():
        kwargs['style'] = 'fisher'
    # get the way to build the affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape
    # if 'verbose' is not specified, do not output the value of the objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {
            "neighbor_mode": "supervised",
            "fisher_score": True,
            'y': y
        }
        # build the within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W.construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples]) / n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {
            "metric": "euclidean",
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 5,
            't': 1
        }
        # build the within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W.construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])),
                           D_within) / np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within) / 2
    L_between = (np.transpose(L_between) + L_between) / 2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # S_within reflects the within-class or local affinity relationship encoded on the graph, Sw = X'*Lw*X
    S_within = (np.transpose(S_within) + S_within) / 2
    # S_between reflects the between-class or global affinity relationship encoded on the graph, Sb = X'*Lb*X
    S_between = (np.transpose(S_between) + S_between) / 2

    # take the absolute values of the diagonals
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features]) / np.sum(
        s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between - k * s_within)[::-1]
        I = np.argsort(s_between - k * s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx]) / np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count + 1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get the feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k
    return feature_idx, feature_score, subset_score
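# Minimal usage sketch (not part of the original file): run trace_ratio on
# the same COIL20 data the other scripts use; the data path is an assumption.
if __name__ == '__main__':
    import scipy.io

    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)
    y = mat['Y'][:, 0]
    # select 100 features, printing the objective value at each iteration
    feature_idx, feature_score, subset_score = trace_ratio(
        X, y, 100, style='fisher', verbose=True)
    print 'Selected features:', feature_idx
    print 'Subset-level score:', subset_score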
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import numpy as np
import scipy.io

import construct_W
from skfeature.function.sparse_learning_based.MCFS import mcfs

# load matlab data
mat = scipy.io.loadmat("COIL20.mat")
X = mat['X']

# construct the affinity matrix W
kwargs_W = {
    "metric": "euclidean",
    "neighbor_mode": "knn",
    "weight_mode": "heat_kernel",
    "k": 5,
    "t": 1
}
W = construct_W.construct_W(X, **kwargs_W)
print W

# mcfs feature selection: select 10 features, assuming 20 clusters
weightMat = mcfs(X, 10, **{"W": W, "n_clusters": 20})
print weightMat
print weightMat.shape
np.savetxt("a.txt", weightMat, fmt='%.5f')