def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix (construct_W expects snake_case keys;
    # camelCase keys such as "neighborMode" are silently ignored)
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    num_fea = 100       # specify the number of selected features
    num_cluster = 20    # specify the number of clusters, usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)

    # rank features in descending order according to their MCFS scores
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeat 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
def mcfs(X, n_selected_features, **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """
    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the generalized eigen-decomposition problem and get the top K
    # eigenvectors with respect to the smallest eigenvalues
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1])

    # solve K L1-regularized regression problems using the LARs algorithm
    # with the cardinality constraint set to n_selected_features
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_
    return W
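# For reference, a minimal usage sketch of the mcfs implementation above, assuming it is
# the scikit-feature version importable from skfeature.function.sparse_learning_based,
# with synthetic stand-in data in place of a real dataset.
import numpy as np
from skfeature.function.sparse_learning_based import MCFS
from skfeature.utility import construct_W

X = np.random.rand(100, 50)  # 100 samples, 50 features

# kNN heat-kernel affinity matrix, the graph that MCFS embeds spectrally
kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs_W)

# one weight column per cluster; feature_ranking sorts by the maximum weight per feature
Weight = MCFS.mcfs(X, n_selected_features=10, W=W, n_clusters=4)
idx = MCFS.feature_ranking(Weight)
print(X[:, idx[0:10]].shape)  # (100, 10)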
def calc_lap_score(data):
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(data, **kwargs_W)
    return lap_score.lap_score(data, W=W)
def utilize_selection_method(self, options):
    logging.info(' Unsupervised Feature Selection : Start')
    self.parse_options(options)
    normalize_feature = SupervisedFs.normalize_feature(self.data_feature)
    feature_amount = len(self.data_feature[0])
    selection_result = {}

    if self.options['v'] == 1:
        widget = ['Calculating Variance : ', pb.Percentage(), ' ',
                  pb.Bar(marker=pb.RotatingMarker()), ' ', pb.ETA()]
        timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
        variance = []
        for n in range(0, feature_amount):
            variance.append([np.var(normalize_feature[:, n]), n+1])
            timer.update(n)
        timer.finish()
        selection_result['variance'] = sorted(variance, reverse=True)

    if self.options['l'] == 1:
        logging.info(' -----Calculating Laplacian score---- ')
        # note: construct_W expects the key 'neighbor_mode', not 'neighbor'
        kwargs_w = {'metric': 'euclidean', 'neighbor_mode': 'knn', 'weight_mode': 'heat_kernel', 'k': 5, 't': 1}
        W = construct_W.construct_W(self.data_feature, **kwargs_w)
        score = lap_score.lap_score(self.data_feature, W=W)
        lap = []
        for n in range(0, feature_amount):
            lap.append([score[n], n+1])
        selection_result['laplacian'] = sorted(lap, reverse=False)
        logging.info(' -----Calculating Laplacian score---- ==> Done')

    if self.options['s'] == 1:
        logging.info(' -----Calculating Spectral score---- ')
        kwargs_w = {'metric': 'euclidean', 'neighbor_mode': 'knn', 'weight_mode': 'heat_kernel', 'k': 5, 't': 1}
        W = construct_W.construct_W(self.data_feature, **kwargs_w)
        kwargs_s = {'style': 2, 'W': W}
        score = SPEC.spec(self.data_feature, **kwargs_s)
        spec = []
        for n in range(0, feature_amount):
            spec.append([score[n], n+1])
        selection_result['spectral'] = sorted(spec, reverse=True)
        logging.info(' -----Calculating Spectral score---- ==> Done')

    return selection_result
def lap_score(X, **kwargs):
    """
    This function implements the laplacian score feature selection, steps are as follows:
    1. Construct the affinity matrix W if it is not specified
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Laplacian score for the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat)

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    score: {numpy array}, shape (n_features,)
        laplacian score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    """
    # if 'W' is not specified, use the default W
    if 'W' not in kwargs.keys():
        W = construct_W(X)
    # otherwise use the user-provided affinity matrix
    else:
        W = kwargs['W']

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # D_prime = fr_hat' * D * fr_hat, the denominator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # L_prime = fr_hat' * W * fr_hat; with L = D - W, the numerator of Lr is D_prime - L_prime
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # avoid the denominator of Lr being 0
    D_prime[D_prime < 1e-12] = 10000

    # compute laplacian score for all features, Lr = 1 - L_prime/D_prime
    score = 1 - np.array(np.multiply(L_prime, 1 / D_prime))[0, :]
    return np.transpose(score)
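# A minimal usage sketch of lap_score above (assuming the scikit-feature layout,
# skfeature.function.similarity_based.lap_score and skfeature.utility.construct_W),
# on synthetic stand-in data. Smaller laplacian scores indicate more important features.
import numpy as np
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

X = np.random.rand(80, 30)  # synthetic stand-in data
kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs_W)

score = lap_score.lap_score(X, W=W)
idx = lap_score.feature_ranking(score)  # ascending: smallest scores rank first
print(idx[:5])  # indices of the five most informative features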
def calc_NDFS(data, n_clusters=20):
    # construct_W expects snake_case keys; camelCase variants are silently ignored
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(data, **kwargs)
    # obtain the feature weight matrix and score each feature by its squared row norm
    Weight = NDFS.ndfs(data, W=W, n_clusters=n_clusters)
    return (Weight * Weight).sum(1)
def SKF_lap(X, y):
    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W(X, **kwargs_W)
    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)
    return lap_score.feature_ranking(score)
def calc_MCFS(data, n_features, n_clusters=20):
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(data, **kwargs_W)
    # score each feature by the maximum of its MCFS weights across clusters
    return MCFS.mcfs(data, n_selected_features=n_features, W=W, n_clusters=n_clusters).max(1)
def test_lap_score():
    # load data
    from functools import partial
    mat = scipy.io.loadmat('./data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs_W)

    num_fea = 100    # number of selected features
    pipeline = []
    # partial function required for SelectKBest to work correctly
    lap_score_partial = partial(lap_score.lap_score, W=W)
    pipeline.append(('select top k', SelectKBest(score_func=lap_score_partial, k=num_fea)))
    model = Pipeline(pipeline)

    # set the y param to zeros to demonstrate that this works in an unsupervised sense
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))
    print(selected_features.shape)

    # perform evaluation on clustering task
    num_cluster = 20    # number of clusters, usually set to the number of classes in the ground truth

    # perform kmeans clustering based on the selected features and repeat 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
    assert_true(float(nmi_total) / 20 > 0.5)
    assert_true(float(acc_total) / 20 > 0.5)
def spec(self, community: int, attributes: list, percentile=0.1):
    result = []
    attributes = list(filter(lambda x: x != 'nodeId' and x != 'id' and x != 'community', attributes))
    print(len(attributes))
    print('Attributes ', attributes)

    nodes_amount = self.get_nodes_amount_of_community(community)
    community_as_matrix = np.empty((nodes_amount, len(attributes)))
    community_nodes = self.get_community_nodes(community)
    node_index = 0
    for node in community_nodes:
        for attribute_index in range(len(attributes)):
            community_as_matrix[node_index, attribute_index] = node[attributes[attribute_index]]
        node_index += 1

    # construct_W defaults to k = 5 neighbors; shrink k for very small communities
    if nodes_amount >= 5:
        w_matrix = construct_W(community_as_matrix)
    else:
        w_matrix = construct_W(community_as_matrix, k=(nodes_amount - 1))

    scores = SPEC.spec(community_as_matrix, W=w_matrix)
    ranked_attributes = feature_ranking(scores)

    # keep the attributes ranked within the top percentile
    boundary = len(attributes) * percentile
    print('Percentile ', percentile)
    print('Boundary ', boundary)
    print('Ranked attributes ', ranked_attributes)
    for i in range(len(attributes)):
        if ranked_attributes[i] < boundary:
            result.append(attributes[i])
    return result
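# For reference, a minimal sketch of calling scikit-feature's SPEC directly, outside the
# community-detection wrapper above. This assumes the skfeature package layout
# (skfeature.function.similarity_based.SPEC) and synthetic stand-in data; when no W is
# passed, SPEC builds its own similarity matrix (an RBF kernel) internally.
import numpy as np
from skfeature.function.similarity_based import SPEC

X = np.random.rand(50, 15)  # synthetic stand-in data

# style=0 scores features against the whole graph spectrum; other styles weight it differently
score = SPEC.spec(X, style=0)
idx = SPEC.feature_ranking(score, style=0)  # for styles 0 and -1, higher scores rank first
print(idx[:5])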
def fisher_score(X, y):
    """
    This function implements the fisher score feature selection, steps are as follows:
    1. Construct the affinity matrix W in fisher score way
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        fisher score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    Duda, Richard et al. "Pattern Classification." John Wiley & Sons, 2012.
    """
    # construct the weight matrix W in a fisher score way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W(X, **kwargs)

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # D_prime = fr_hat' * D * fr_hat, the denominator of the laplacian score
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # L_prime = fr_hat' * W * fr_hat; with L = D - W, the numerator is D_prime - L_prime
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # avoid the denominator being 0
    D_prime[D_prime < 1e-12] = 10000
    lap_score = 1 - np.array(np.multiply(L_prime, 1 / D_prime))[0, :]

    # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1
    score = 1.0 / lap_score - 1
    return np.transpose(score)
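# A minimal usage sketch of the supervised fisher_score above (assuming the
# scikit-feature layout, skfeature.function.similarity_based.fisher_score),
# on synthetic stand-in data. Larger fisher scores indicate more discriminative features.
import numpy as np
from skfeature.function.similarity_based import fisher_score

X = np.random.rand(60, 10)         # 60 samples, 10 features
y = np.random.randint(0, 3, 60)    # 3 synthetic classes

score = fisher_score.fisher_score(X, y)
idx = fisher_score.feature_ranking(score)  # descending: largest scores rank first
print(idx[:3])  # indices of the three most discriminative features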
def SKF_ndfs(X, y):
    # construct affinity matrix (construct_W expects snake_case keys)
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W(X, **kwargs)

    # number of clusters, usually set to the number of classes in the ground truth
    num_cluster = len(set(y))

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)
    return sparse_learning.feature_ranking(Weight)
def mcfs(trnin, num_fea):
    from skfeature.function.sparse_learning_based import MCFS
    from skfeature.utility import construct_W

    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(trnin, **kwargs_W)

    score = MCFS.mcfs(trnin, num_fea, W=W)
    idx = MCFS.feature_ranking(score)
    selfea = idx[0:num_fea]
    return selfea
def get_lap_score(data, k=5, t=1, top_feature=30):
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": k, 't': t}
    W = construct_W.construct_W(data, **kwargs_W)
    score = lap_score.lap_score(data, W=W)
    ranking = lap_score.feature_ranking(score)

    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Feature', 'Score']  # naming the dataframe columns
    # smaller laplacian scores indicate more important features, so take the smallest
    result = featureScores.nsmallest(top_feature, 'Score')
    return result, ranking
def laplacian_score(X, y=None, **kwargs):
    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs_W)
    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)
    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)
    return idx
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix (construct_W expects snake_case keys)
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # perform evaluation on clustering task
    num_fea = 100       # number of selected features
    num_cluster = 20    # number of clusters, usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)

    # rank features in descending order according to their scores
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeat 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
def Laplacian_score(diheds):
    import numpy
    from numpy import mean
    from skfeature.function.similarity_based import lap_score
    from skfeature.utility import construct_W

    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    idx = []
    # change the path for every system to be run, e.g.
    # os.chdir('/home/anu/Downloads/traj_benz_trypsin/')
    for i in range(len(diheds)):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs_W)
        score = lap_score.lap_score(X, W=W)
        idx.append(score)
    col_mean = mean(idx, axis=0)
    imp_features = numpy.argsort(col_mean)
    return col_mean, imp_features
def plot_ls_after_vt_filtering(self, threshold):
    data = self.test_reddy_dataset.expression_data.copy()
    vt_data = self.variance_threshold_selector(data, threshold)

    # perform laplacian score filtering on the variance-thresholded data
    vt_numpy = vt_data.to_numpy()

    # construct affinity matrix
    kwargs_W = {"metric": "cosine", "neighbor_mode": "knn", "weight_mode": "cosine", "k": 40, 't': 500}
    print("We plot the Laplacian scores of the features using the following affinity matrix parameters: "
          + str(kwargs_W))
    W = construct_W.construct_W(vt_numpy, **kwargs_W)

    # compute the laplacian score of each remaining feature
    score = lap_score.lap_score(vt_numpy, W=W)
    self.plot_lap_scores(score)
def SKF_mcfs(X, y):
    # construct affinity matrix (construct_W expects snake_case keys)
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W(X, **kwargs)

    num_fea = X.shape[1]         # specify the number of selected features
    num_cluster = len(set(y))    # number of clusters, usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)
    return MCFS.feature_ranking(Weight)
def lap_score_filtering(self, vt_data, num_features):
    vt_numpy = vt_data.to_numpy()

    # construct affinity matrix
    kwargs_W = {"metric": "cosine", "neighbor_mode": "knn", "weight_mode": "cosine", "k": 40, 't': 500}
    print("We perform Laplacian score filtering using the following parameters: " + str(kwargs_W))
    W = construct_W.construct_W(vt_numpy, **kwargs_W)

    score = lap_score.lap_score(vt_numpy, W=W)
    idx = lap_score.feature_ranking(score)  # rank features

    filtered_data = vt_data.iloc[:, idx[0:num_features]].copy()
    print("\nThe data now has " + str(len(filtered_data.T)) + " features after Laplacian score filtering.")
    return filtered_data
def mcfs_score(diheds):
    from numpy import mean
    from skfeature.function.sparse_learning_based import MCFS
    from skfeature.utility import construct_W

    idx = []
    # construct_W expects snake_case keys; camelCase variants are silently ignored
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    # change the path for every system to be run, e.g.
    # os.chdir('/home/anu/Downloads/DESRES-Trajectory_GTT-1-protein/GTT-1-protein')
    for i in range(0, len(diheds), 5):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs)
        score = MCFS.mcfs(X, n_selected_features=20, W=W, n_clusters=20)
        idx.append(score)
    col_mean = mean(idx, axis=0)
    imp_features = MCFS.feature_ranking(col_mean)
    return col_mean, imp_features
def MCFS_FS(X_train, k):
    # construct affinity matrix (construct_W expects snake_case keys)
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X_train, **kwargs)

    num_fea_ = k        # specify the number of selected features
    num_cluster = 20    # specify the number of clusters, usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X_train, n_selected_features=num_fea_, W=W, n_clusters=num_cluster)

    # rank features in descending order according to their MCFS scores
    idx = MCFS.feature_ranking(Weight)
    return (idx, Weight)
def MCFS(X, y=None, **kwargs):
    # construct affinity matrix (note: this ignores any user-supplied kwargs)
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs_W)
    num_cluster = len(np.unique(y))

    # obtain the feature weight matrix
    Weight = MCFS_CLASS.mcfs(X, n_selected_features=X.shape[1], W=W, n_clusters=num_cluster)

    # rank features in descending order according to their MCFS scores
    idx = MCFS_CLASS.feature_ranking(Weight)
    return idx
def main():
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]    # data
    X = X.astype(float)
    y = mat["Y"]    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, "t": 1}
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)

    # perform evaluation on clustering task
    num_fea = 100       # number of selected features
    num_cluster = 20    # number of clusters, usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeat 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print("NMI:", float(nmi_total) / 20)
    print("ACC:", float(acc_total) / 20)
def predict(self, X):
    """
    :param X: shape [n_row*n_clm, n_band]
    :return:
    """
    # n_row, n_column, __n_band = X.shape
    # XX = X.reshape((n_row * n_column, -1))  # n_sample * n_band
    XX = X
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(XX, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(XX, W=W)

    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)

    # obtain the dataset on the selected bands
    selected_features = X[:, idx[0:self.n_band]]
    # selected_features.reshape((self.n_band, n_row, n_column))
    # selected_features = np.transpose(selected_features, axes=(1, 2, 0))
    return selected_features
def bench(self, X, X_norm, y, n=2):
    num_feats = 20
    output_data = {'method': list(), 'features': list(), 'time': list(),
                   self.test_att: list(), 'supervised': list()}

    # ----------------------------------------------------------------
    # CFS
    # start = time.perf_counter()
    # idx = cfs(X_norm.to_numpy(), y.to_numpy())[0]
    # print(idx)
    # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    # output_data['method'].append('CFS')
    # output_data['time'].append(time.perf_counter() - start)
    # output_data['features'].append(selected_features)
    # output_data[self.test_att].append(self.train_real_data(selected_features, X))

    # LA: Laplacian Score
    start = time.perf_counter()
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X_norm.to_numpy(), **kwargs_W)
    score = lap_score.lap_score(X_norm.to_numpy(), W=W)
    idx = lap_score.feature_ranking(score)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('Laplacian Score')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # FCBF: feature correlation based filter
    # start = time.perf_counter()
    # idx = fcbf(X_norm.to_numpy(), y.to_numpy(), n_selected_features=num_feats)[0]
    # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    # output_data['method'].append('FCBF')
    # output_data['time'].append(time.perf_counter() - start)
    # output_data['features'].append(selected_features)
    # output_data['supervised'].append(True)
    # output_data[self.test_att].append(self.train_real_data(selected_features, X))
    # print(output_data)
    # output_data['method'].append('FCBF')
    # output_data['time'].append(9999999)
    # output_data['features'].append([])
    # output_data['supervised'].append(True)
    # output_data[self.test_att].append(0.0)

    # UDFS: Unsupervised Discriminative Feature Selection
    start = time.perf_counter()
    Weight = udfs(X_norm.to_numpy(), gamma=0.1, n_clusters=n)
    idx = feature_ranking(Weight)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('UDFS')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # SPEC: Spectral Feature Selection
    start = time.perf_counter()
    score = spec(X_norm.to_numpy())
    idx = feature_ranking_spec(score)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('SPEC')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # mRMR: minimum redundancy maximum relevance (MIQ variant)
    start = time.perf_counter()
    mrmr = pymrmr.mRMR(X_norm, 'MIQ', num_feats)
    output_data['method'].append('MRMR(MIQ)')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(mrmr)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(mrmr, X))
    print(output_data)

    # mRMR: minimum redundancy maximum relevance (MID variant)
    start = time.perf_counter()
    mrmr = pymrmr.mRMR(X_norm, 'MID', num_feats)
    output_data['method'].append('MRMR(MID)')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(mrmr)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(mrmr, X))
    print(output_data)

    # RFE: recursive feature elimination
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
    start = time.perf_counter()
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X_norm.loc[:, rfe_support].columns.tolist()
    output_data['method'].append('RFE')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(rfe_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(rfe_feature, X))
    print(output_data)

    # ----------------------------------------------------------------
    # Lasso: SelectFromModel
    from sklearn.feature_selection import SelectFromModel
    # the l1 penalty requires the liblinear solver in recent scikit-learn versions
    embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1", solver="liblinear"),
                                          max_features=num_feats)
    start = time.perf_counter()
    embeded_lr_selector.fit(X_norm, y)
    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X_norm.loc[:, embeded_lr_support].columns.tolist()
    output_data['method'].append('Lasso')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(embeded_lr_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(embeded_lr_feature, X))
    print(output_data)
    print(str(len(embeded_lr_feature)), 'selected features')

    # -----------------------------------------------------------------------------
    # Tree-based: SelectFromModel with a random forest
    from sklearn.ensemble import RandomForestClassifier
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
    start = time.perf_counter()
    embeded_rf_selector.fit(X_norm, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X_norm.loc[:, embeded_rf_support].columns.tolist()
    output_data['method'].append('Tree_Based_RF')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(embeded_rf_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(embeded_rf_feature, X))
    print(output_data)
    print(str(len(embeded_rf_feature)), 'selected features')

    # -------------------------------------------------------------------------------
    # also tree-based: SelectFromModel with LightGBM
    from lightgbm import LGBMClassifier
    lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                          reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)
    embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
    start = time.perf_counter()
    embeded_lgb_selector.fit(X_norm, y)
    embeded_lgb_support = embeded_lgb_selector.get_support()
    embeded_lgb_feature = X_norm.loc[:, embeded_lgb_support].columns.tolist()
    output_data['method'].append('Tree_Based_lightGBM')
    output_data['time'].append(time.perf_counter() - start)
    output_data['supervised'].append(True)
    output_data['features'].append(embeded_lgb_feature)
    output_data[self.test_att].append(self.train_real_data(embeded_lgb_feature, X))
    print(output_data)
    print(str(len(embeded_lgb_feature)), 'selected features')

    return output_data
def trace_ratio(X, y, n_selected_features, **kwargs):
    """
    This function implements the trace ratio criterion for feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if user wants to print out the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the ranked (descending order) feature index based on subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """
    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in kwargs.keys():
        kwargs['style'] = 'fisher'
    # get the way to build affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape

    # if 'verbose' is not specified, do not output the value of objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    # compare strings with ==, not the identity operator 'is'
    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # S_within reflects the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X'
    S_within = (np.transpose(S_within) + S_within)/2
    # S_between reflects the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X'
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of the diagonal
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between - k*s_within)[::-1]
        I = np.argsort(s_between - k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count+1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k
    return feature_idx, feature_score, subset_score
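# A minimal usage sketch of the trace_ratio function above (assuming the scikit-feature
# layout, skfeature.function.similarity_based.trace_ratio), on synthetic stand-in data.
import numpy as np
from skfeature.function.similarity_based import trace_ratio

X = np.random.rand(50, 20)         # 50 samples, 20 features
y = np.random.randint(0, 2, 50)    # binary synthetic labels

# select 5 features using the fisher-style graphs (the default)
feature_idx, feature_score, subset_score = trace_ratio.trace_ratio(X, y, 5)
print(feature_idx[:5], subset_score)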
# build the sparse affinity matrix W
W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples))
bigger = np.transpose(W) > W
W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
print('Sparse Affinity Matrix:', W)

## Logging
# with open('output.txt', 'a') as f:
#     print("W", file=f)
#     print(W, file=f)

## Euclidean laplacian result
numTrainData = trainData.values
# note: the key is 'neighbor_mode'; a misspelled key is silently ignored by construct_W
kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn"}
W = construct_W.construct_W(numTrainData, **kwargs_W)

## Calculate Laplacian Score
score = lap_score.lap_score(numTrainData, W=W)
print('Laplacian Score:', score)

## Logging
with open('output.txt', 'a') as f:
    print("Laplacian Score", file=f)
    print(score, file=f)

# Laplacian HEOM result hardcoded
"""score = np.array(
    [np.nan, np.nan, np.nan, np.nan, 0.25866548, 0.25866548, np.nan,
     0.25946108, np.nan, np.nan, np.nan, np.nan, 0.67265115, 0.73108302,
     np.nan, np.nan, np.nan, 0.86144223, np.nan, 0.6201575, np.nan,
     np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
     0.8655987, 0.85803891, 0.87968564, 0.88995775, 0.87647355, 0.86576088,
def ndfs(X, **kwargs):
    """
    This function implements unsupervised feature selection using nonnegative spectral analysis, i.e.,
    min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2, s.t. F >= 0

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        alpha: {float}
            parameter alpha in the objective function
        beta: {float}
            parameter beta in the objective function
        gamma: {float}
            a very large number used to force F^T F = I
        F0: {numpy array}, shape (n_samples, n_clusters)
            initialization of the pseudo label matrix F; computed with kmeans if not provided
        n_clusters: {int}
            number of clusters
        verbose: {boolean}
            True if user wants to print out the objective function value in each iteration, False if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Li, Zechao et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI 2012.
    """
    # default gamma is 10e8
    if 'gamma' not in kwargs:
        gamma = 10e8
    else:
        gamma = kwargs['gamma']
    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    if 'alpha' not in kwargs:
        alpha = 1
    else:
        alpha = kwargs['alpha']
    if 'beta' not in kwargs:
        beta = 1
    else:
        beta = kwargs['beta']
    if 'F0' not in kwargs:
        if 'n_clusters' not in kwargs:
            print("either F0 or n_clusters should be provided", file=sys.stderr)
        else:
            # initialize F
            n_clusters = kwargs['n_clusters']
            F = kmeans_initialization(X, n_clusters)
    else:
        F = kwargs['F0']
    # number of clusters, needed below even when F0 is supplied
    n_clusters = F.shape[1]
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    n_samples, n_features = X.shape

    # initialize D as an identity matrix
    D = np.identity(n_features)
    I = np.identity(n_samples)

    # build laplacian matrix
    L = np.array(W.sum(1))[:, 0] - W

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W
        T = np.linalg.inv(np.dot(X.transpose(), X) + beta * D + 1e-6*np.eye(n_features))
        W = np.dot(np.dot(T, X.transpose()), F)
        # update D
        temp = np.sqrt((W*W).sum(1))
        temp[temp < 1e-16] = 1e-16
        temp = 0.5 / temp
        D = np.diag(temp)
        # update M
        M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose()))
        M = (M + M.transpose())/2
        # update F
        denominator = np.dot(M, F) + gamma*np.dot(np.dot(F, F.transpose()), F)
        temp = np.divide(gamma*F, denominator)
        F = F*np.array(temp)
        temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16))))
        F = np.dot(F, temp)

        # calculate objective function
        obj[iter_step] = np.trace(np.dot(np.dot(F.transpose(), M), F)) \
            + gamma/4*np.linalg.norm(np.dot(F.transpose(), F) - np.identity(n_clusters), 'fro')
        if verbose:
            print('obj at iter ' + str(iter_step+1) + ': ' + str(obj[iter_step]))
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    return W
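# A minimal usage sketch of ndfs above (assuming the scikit-feature layout,
# skfeature.function.sparse_learning_based.NDFS and
# skfeature.utility.sparse_learning.feature_ranking), on synthetic stand-in data.
import numpy as np
from skfeature.function.sparse_learning_based import NDFS
from skfeature.utility import construct_W
from skfeature.utility.sparse_learning import feature_ranking

X = np.random.rand(100, 40)  # synthetic stand-in data
kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs)

# feature weight matrix; feature_ranking sorts by the l2 norm of each row, descending
Weight = NDFS.ndfs(X, W=W, n_clusters=4)
idx = feature_ranking(Weight)
print(idx[:10])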
def select(dataset, features_number, clusters_number):
    app_logger.info(
        'STARTED [MCFS Selection] on {0} with features number = {1}'.format(dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on the disk
    current_dir = os.getcwd().split('\\')[-1]
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    if current_dir == project_dir:
        all_features_train = pd.read_pickle('Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle('Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))
    else:
        all_features_train = pd.read_pickle('../Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle('../Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # np.savetxt(r'testDataFrame.txt', all_features_test.values, fmt='%d')

    # Retrieving the independent columns of both sets and the known labels of the test set
    independent_columns_train = all_features_train.iloc[:, 1:]
    independent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # Building matrix W for the MCFS algorithm
    kwargs = {
        'metric': 'euclidean',
        'neighbor_mode': 'knn',
        'weight_mode': 'binary',
        'k': 3
        # 'weight_mode': 'heat_kernel',
        # 'k': 5,
        # 't': 1
    }
    W = construct_W.construct_W(independent_columns_train.values, **kwargs)

    # MCFS gives a weight to each feature
    kwargs = {'W': W, 'n_clusters': clusters_number}
    weighted_features = MCFS.mcfs(independent_columns_train.values, features_number, **kwargs)

    # Ordering the features according to their weight
    ordered_features = MCFS.feature_ranking(weighted_features)

    # Getting only the first 'features_number' features
    selected_features = ordered_features[0:features_number]

    # Getting the names of the selected features
    names_selected_features = []
    for feature_index in selected_features:
        names_selected_features.append(independent_columns_train.columns[feature_index])

    # Keeping only the selected features on the train set
    selected_features_train = independent_columns_train.loc[:, names_selected_features]
    app_logger.info('Selected features trainset: {0}'.format(selected_features_train.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Keeping only the selected features on the test set
    selected_features_test = independent_columns_test.loc[:, names_selected_features]
    app_logger.info('Selected features testset: {0}'.format(selected_features_test.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    '''
    # Pickles for rfd
    if selected_features_train.shape[0] > 1000:
        print('Test-set')
        selected_features_test.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    else:
        print('Train-set')
        selected_features_train.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    exit()
    '''

    # Running k-means according to the selected features
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'MCFS', features_number, dataset, selected_features_train.values,
        selected_features_test.values, clusters_number, known_labels_test)

    app_logger.info('ENDED [MCFS Selection] on {0}'.format(dataset), extra=LOGGER_EXTRA_OBJECT)


# Testing
# select('TwoPatterns', 10, 4)
def trace_ratio(X, y, n_selected_features=None, mode='rank', **kwargs):
    """
    This function implements the trace ratio criterion for feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select (defaults to all features)
    mode: {string}
        'rank'  - return the rank position of every feature (default)
        'index' - return the ranked feature index
        'raw'   - return (feature_idx, feature_score, subset_score)
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if user wants to print out the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the ranked (descending order) feature index based on subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """
    if n_selected_features is None:
        n_selected_features = X.shape[1]
    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in kwargs:
        kwargs['style'] = 'fisher'
    # get the way to build affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape

    # if 'verbose' is not specified, do not output the value of objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    # compare strings with ==, not the identity operator 'is'
    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # S_within reflects the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X'
    S_within = (np.transpose(S_within) + S_within)/2
    # S_between reflects the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X'
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of the diagonal
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between - k*s_within)[::-1]
        I = np.argsort(s_between - k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count+1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k

    if mode == 'raw':
        return feature_idx, feature_score, subset_score
    elif mode == 'index':
        return feature_idx
    else:
        # 'rank': the rank position of each feature
        return reverse_argsort(feature_idx)
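# A short sketch of how the three mode values of this trace_ratio variant differ, calling
# the function as defined above on synthetic stand-in data (reverse_argsort is assumed to
# come from the surrounding module).
import numpy as np

X = np.random.rand(40, 12)
y = np.random.randint(0, 2, 40)

ranks = trace_ratio(X, y, 6)                            # mode='rank': rank position of every feature
idx = trace_ratio(X, y, 6, mode='index')                # ranked feature indices, best first
idx, scores, subset = trace_ratio(X, y, 6, mode='raw')  # full diagnostic output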
print(count)
X_train, X_test = features[train_index], features[test_index]
y_train, y_test = labels[train_index], labels[test_index]
start_time = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
acc = []

# lap_score
method = 'lap_score'
kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X_train, **kwargs_W)
score = lap_score.lap_score(X_train, W=W)
idx = lap_score.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# fisher_score
score = fisher_score.fisher_score(X_train, y_train)
idx = fisher_score.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

# URL for the Pima Indians Diabetes dataset (UCI Machine Learning Repository)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# download the file
raw_data = urlopen(url)

# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
X = dataset[:, :8]    # the first 8 columns are features; column 8 is the label
y = dataset[:, 8]

kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs_W)

score = lap_score.lap_score(X, W=W)
print(score)
idx = lap_score.feature_ranking(score)

fig = plt.figure()
plt.plot(score, label='Laplacian Score')
plt.legend(loc='upper center', shadow=True)
plt.show()

print(idx)
num_fea = 3
# selected_features = X[:, idx[0:num_fea]]
# print(selected_features)