def main():
    """Run the MCFS feature-selection demo on COIL20 and print mean scores.

    Loads ``../data/COIL20.mat``, builds an affinity matrix over the samples,
    computes the MCFS feature weights, keeps the first ``num_fea`` ranked
    features, and then calls ``unsupervised_evaluation.evaluation``
    ``num_runs`` times, printing the average NMI and ACC.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]    # label (first column only)

    # construct affinity matrix
    # NOTE(review): upstream scikit-feature's construct_W looks up the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm that the
    # camelCase keys below are honored by the construct_W in use.
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }
    W = construct_W.construct_W(X, **kwargs)

    num_fea = 100       # number of selected features
    num_cluster = 20    # usually the number of classes in the ground truth
    num_runs = 20       # number of repeated clustering evaluations

    # obtain the feature weight matrix; the cluster count is taken from
    # num_cluster so the selection and the evaluation cannot drift apart
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W,
                       n_clusters=num_cluster)

    # rank the features from the weight matrix
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # repeat the clustering evaluation and accumulate the scores
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC; a single pre-formatted string
    # prints identically under both Python 2 and Python 3
    print('NMI: {0}'.format(float(nmi_total) / num_runs))
    print('ACC: {0}'.format(float(acc_total) / num_runs))
def featureSelection(listNewTimeSeries, numFeature, numCluster=5):
    """
    @description  :
    Select features using the MCFS algorithm.
    ---------
    @param  :
    listNewTimeSeries -- input data; converted with np.array before use.
    numFeature -- how many features are to be selected.
    numCluster -- cluster count passed to MCFS, default is 5.
    -------
    @Returns  :
    selected_features -- the columns of the data picked by the first
                         numFeature entries of the ranking.
    idx -- the full index array returned by MCFS.feature_ranking
           (not truncated to numFeature).
    -------
    """
    data = np.array(listNewTimeSeries)

    # NOTE(review): upstream scikit-feature's construct_W reads the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm these
    # camelCase keys are honored by the construct_W in use.
    affinity_options = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }
    affinity = construct_W.construct_W(data, **affinity_options)

    weights = MCFS.mcfs(data,
                        n_selected_features=numFeature,
                        W=affinity,
                        n_clusters=numCluster)
    idx = MCFS.feature_ranking(weights)
    return data[:, idx[:numFeature]], idx
def mcfs(trnin, num_fea):
    """Return the first ``num_fea`` entries of the MCFS feature ranking.

    Builds a kNN (k=5) heat-kernel affinity matrix over ``trnin`` with
    ``construct_W``, computes the MCFS weight matrix with it, and slices the
    ranking produced by ``MCFS.feature_ranking``.
    """
    from skfeature.utility import construct_W
    from skfeature.function.sparse_learning_based import MCFS

    affinity = construct_W.construct_W(
        trnin,
        metric="euclidean",
        neighbor_mode="knn",
        weight_mode="heat_kernel",
        k=5,
        t=1,
    )
    weights = MCFS.mcfs(trnin, num_fea, W=affinity)
    ranking = MCFS.feature_ranking(weights)
    return ranking[:num_fea]
def main():
    """Run the MCFS feature-selection demo on COIL20 and print mean scores.

    Loads ``../data/COIL20.mat``, builds an affinity matrix over the samples,
    computes the MCFS feature weights, keeps the first ``num_fea`` ranked
    features, and then calls ``unsupervised_evaluation.evaluation``
    ``num_runs`` times, printing the average NMI and ACC.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]    # label (first column only)

    # construct affinity matrix
    # NOTE(review): upstream scikit-feature's construct_W looks up the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm that the
    # camelCase keys below are honored by the construct_w in use.
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }
    W = construct_w.construct_w(X, **kwargs)

    num_fea = 100       # number of selected features
    num_cluster = 20    # usually the number of classes in the ground truth
    num_runs = 20       # number of repeated clustering evaluations

    # obtain the feature weight matrix; the cluster count is taken from
    # num_cluster so the selection and the evaluation cannot drift apart
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W,
                       n_clusters=num_cluster)

    # rank the features from the weight matrix
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # repeat the clustering evaluation and accumulate the scores
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC; the numerator is already a
    # float, so plain "/" is true division and old_div is not needed
    print('NMI:', float(nmi_total) / num_runs)
    print('ACC:', float(acc_total) / num_runs)
def SKF_mcfs(X, y):
    """Rank every feature of ``X`` with MCFS.

    The number of clusters is the number of distinct values in ``y`` and the
    number of selected features is the full column count of ``X``. Returns
    whatever ``MCFS.feature_ranking`` produces for the weight matrix.
    """
    # construct affinity matrix
    # NOTE(review): upstream scikit-feature's construct_W reads the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm these
    # camelCase keys are honored by the construct_W in use.
    affinity_options = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }
    affinity = construct_W(X, **affinity_options)

    n_features = X.shape[1]      # keep all features, only rank them
    n_classes = len(set(y))      # one cluster per distinct label

    # obtain the feature weight matrix and turn it into a ranking
    weights = MCFS.mcfs(X,
                        n_selected_features=n_features,
                        W=affinity,
                        n_clusters=n_classes)
    return MCFS.feature_ranking(weights)
def MCFS_FS(X_train, k, n_clusters=20):
    """Compute MCFS feature weights and ranking for ``X_train``.

    Args:
        X_train: data matrix handed to ``construct_W`` and ``MCFS.mcfs``.
        k: number of features to select (``n_selected_features``).
        n_clusters: cluster count passed to MCFS. Defaults to 20, the value
            that used to be hard-coded; it is usually set to the number of
            classes in the ground truth.

    Returns:
        A tuple ``(idx, Weight)``: the ranking from
        ``MCFS.feature_ranking`` and the weight matrix it was derived from.
    """
    # construct affinity matrix
    # NOTE(review): upstream scikit-feature's construct_W reads the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm these
    # camelCase keys are honored by the construct_W in use.
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }
    W = construct_W.construct_W(X_train, **kwargs)

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X_train, n_selected_features=k, W=W,
                       n_clusters=n_clusters)

    # rank the features from the weight matrix
    idx = MCFS.feature_ranking(Weight)

    return (idx, Weight)
def mcfs_score(diheds):
    """Average MCFS weight matrices over a strided subset of ``diheds``.

    Every 5th entry of ``diheds`` (indices 0, 5, 10, ...) is treated as a
    data matrix: an affinity matrix is built for it and MCFS is run with 20
    selected features and 20 clusters. The resulting weight matrices are
    averaged element-wise and the average is ranked.

    Args:
        diheds: indexable, sized collection of data matrices.

    Returns:
        A tuple ``(col_mean, imp_features)``: the element-wise mean of the
        MCFS weight matrices and the ranking ``MCFS.feature_ranking``
        derives from it.

    Raises:
        ValueError: if ``diheds`` is empty (there would be nothing to
            average).
    """
    from numpy import mean

    from skfeature.function.sparse_learning_based import MCFS
    from skfeature.utility import construct_W

    if len(diheds) == 0:
        raise ValueError("diheds must contain at least one data matrix")

    # NOTE(review): upstream scikit-feature's construct_W reads the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm these
    # camelCase keys are honored by the construct_W in use.
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }

    # Only every 5th entry is scored.
    scores = []
    for i in range(0, len(diheds), 5):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs)
        scores.append(
            MCFS.mcfs(X, n_selected_features=20, W=W, n_clusters=20))

    col_mean = mean(scores, axis=0)
    imp_features = MCFS.feature_ranking(col_mean)
    return col_mean, imp_features
def calc_MCFS(data, n_features, n_clusters=20):
    """Return the per-row maximum (``.max(1)``) of the MCFS weight matrix.

    A kNN (k=5) heat-kernel affinity matrix is built over ``data`` and passed
    to ``MCFS.mcfs`` together with ``n_features`` and ``n_clusters``.
    """
    affinity = construct_W.construct_W(
        data,
        metric="euclidean",
        neighbor_mode="knn",
        weight_mode="heat_kernel",
        k=5,
        t=1,
    )
    weights = MCFS.mcfs(data,
                        n_selected_features=n_features,
                        W=affinity,
                        n_clusters=n_clusters)
    return weights.max(1)
def MCFS(X, y=None, **kwargs):
    """Rank all features of ``X`` with MCFS.

    The cluster count is the number of unique values in ``y`` and the number
    of selected features is the column count of ``X``. Returns the ranking
    from ``MCFS_CLASS.feature_ranking``.

    NOTE(review): the ``**kwargs`` supplied by the caller are accepted but
    never used -- the affinity options below are fixed. Confirm that this is
    intentional (e.g. a uniform selector signature).
    NOTE(review): with the default ``y=None`` the cluster count comes from
    ``np.unique(None)``; presumably callers always pass labels -- confirm.
    """
    # construct affinity matrix
    # NOTE(review): upstream scikit-feature's construct_W reads the
    # snake_case keys 'neighbor_mode' / 'weight_mode' -- confirm these
    # camelCase keys are honored by the construct_W in use.
    affinity_options = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1,
    }
    affinity = construct_W.construct_W(X, **affinity_options)

    n_labels = len(np.unique(y))

    # obtain the feature weight matrix, then rank the features from it
    weights = MCFS_CLASS.mcfs(X,
                              n_selected_features=X.shape[1],
                              W=affinity,
                              n_clusters=n_labels)
    return MCFS_CLASS.feature_ranking(weights)
def select(dataset, features_number, clusters_number):
    """Select features with MCFS for ``dataset`` and evaluate them.

    Loads the pickled train/test feature tables for ``dataset``, treats
    column 0 as the target and the remaining columns as features, ranks the
    train features with MCFS (binary kNN affinity, k=3), keeps the first
    ``features_number`` of the ranking, and hands the reduced train/test
    matrices to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: dataset name; used to build the pickle file names and in
            log messages.
        features_number: number of features to keep.
        clusters_number: cluster count passed to MCFS and to the k-means
            evaluation.
    """
    app_logger.info(
        'STARTED [MCFS Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on disk.
    # os.path.basename understands the platform's path separator; splitting
    # on '\\' only worked on Windows, so elsewhere the project-directory
    # check could never succeed.
    projet_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    prefix = '' if os.path.basename(os.getcwd()) == projet_dir else '../'
    all_features_train = pd.read_pickle(
        prefix + 'Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
    all_features_test = pd.read_pickle(
        prefix + 'Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)
    # np.savetxt(r'testDataFrame.txt', all_features_test.values, fmt='%d')

    # Independent columns of both sets and known labels of the test set
    indipendent_columns_train = all_features_train.iloc[:, 1:]
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # Building matrix W for the MCFS algorithm
    kwargs = {
        'metric': 'euclidean',
        'neighbor_mode': 'knn',
        'weight_mode': 'binary',
        'k': 3
        # 'weight_mode': 'heat_kernel',
        # 'k': 5,
        # 't': 1
    }
    W = construct_W.construct_W(indipendent_columns_train.values, **kwargs)

    # MCFS gives a weight to each feature
    kwargs = {'W': W, 'n_clusters': clusters_number}
    weighted_features = MCFS.mcfs(indipendent_columns_train.values,
                                  features_number, **kwargs)

    # Ordering the features according to their weight
    ordered_features = MCFS.feature_ranking(weighted_features)

    # Getting only the first 'features_number' features
    selected_features = ordered_features[0:features_number]

    # Getting names of selected features
    names_selected_features = [
        indipendent_columns_train.columns[feature_index]
        for feature_index in selected_features
    ]

    # Keep only the selected features on the train set
    selected_features_train = indipendent_columns_train.loc[
        :, names_selected_features]
    app_logger.info('Selected features trainset: {0}'.format(
        selected_features_train.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Keep only the selected features on the test set
    selected_features_test = indipendent_columns_test.loc[
        :, names_selected_features]
    app_logger.info('Selected features testset: {0}'.format(
        selected_features_test.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Disabled: pickles for rfd
    # if selected_features_train.shape[0] > 1000:
    #     print('Test-set')
    #     selected_features_test.to_pickle(
    #         '../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # else:
    #     print('Train-set')
    #     selected_features_train.to_pickle(
    #         '../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # exit()

    # Running k-means according to selected features
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'MCFS', features_number, dataset, selected_features_train.values,
        selected_features_test.values, clusters_number, known_labels_test)

    app_logger.info('ENDED [MCFS Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)


# Testing
#select('TwoPatterns', 10, 4)
def _dist_errors(D0, x, selected):
    """Return ``ef.dif_dist`` of ``D0`` vs. ``compute_dist`` of the chosen
    columns of ``x``, for the modes 'l1', 'l2' and 'lmax' (in that order)."""
    D = compute_dist(x[:, selected])
    return (ef.dif_dist(D0, D, 'l1'),
            ef.dif_dist(D0, D, 'l2'),
            ef.dif_dist(D0, D, 'lmax'))


def generate_result_dist(dataset, x, y, num_select, zero_mean=False, N=1000,
                         t=0.6, thresh=0.1):
    """Evaluate several feature-selection methods over a range of dimensions.

    For the ranking-based selection (four variants, over a grid of
    ``preserve_pctg`` and ``num_use`` values), Laplacian score, SPEC and MCFS
    (over several cluster counts), the first ``dimension`` selected features
    are taken for each ``dimension`` in a schedule derived from
    ``num_select``, and ``ef.dif_dist`` between ``compute_dist`` of the full
    data and of the reduced data is recorded for the modes 'l1', 'l2' and
    'lmax'. Each result array is saved with ``np.save`` under
    ``./result/<dataset>/``.

    Args:
        dataset: name of the sub-directory of ``./result/`` to save into.
        x: 2-D data matrix; normalized (``zero_mean == False``) or
            standardized before use.
        y: not used in the body.
        num_select: number of features each method selects; also fixes the
            dimension schedule.
        zero_mean: chooses ``standardize_feature`` over ``normalize``.
        N: forwarded to ``ranking_selection``.
        t, thresh: not used in the body.

    Returns:
        ``(rank_result, rank_result_l1, rank_result_l2, rank_result_lmax,
        lap_score_result, SPEC_result, MCFS_result)``. The metric axis of
        every array has 7 slots, of which only indices 0-2 are filled here.
    """
    # Kept as "== False" (not "not zero_mean") so that values such as None
    # keep selecting the standardize branch exactly as before.
    if zero_mean == False:
        x = normalize(x, axis=0)
    else:
        x = standardize_feature(x)
    _, d = x.shape

    # num_select -> (first dimension, step) of the evaluated dimensions
    schedule = {
        300: (20, 20),
        200: (20, 10),
        100: (10, 10),
        50: (10, 5),
        20: (4, 2),
    }
    start_dim, step = schedule.get(num_select, (5, 1))
    dimension_list = list(range(start_dim, num_select + 1, step))
    n_dims = len(dimension_list)
    out_dir = './result/' + dataset

    D0 = compute_dist(x)

    ######### rank: parameter preserve_pctg, num_use #########
    preserve_pctg_list = [0.2, 0.4, 0.6, 0.8, 1]  # dimension 0
    num_use_list = [0.1, 0.2, 0.3, 0.4, 0.5]  # dimension 1
    rank_shape = [len(preserve_pctg_list), len(num_use_list), 7, n_dims]
    rank_result = np.zeros(rank_shape)
    rank_result_l1 = np.zeros(rank_shape)
    rank_result_l2 = np.zeros(rank_shape)
    rank_result_lmax = np.zeros(rank_shape)

    for i, preserve_pctg in enumerate(preserve_pctg_list):
        for j, num_use in enumerate(num_use_list):
            print(i, j)
            (rank_selected, rank_selected_l1, rank_selected_l2,
             rank_selected_lmax) = ranking_selection(
                 x, num_select, N=N, num_use=int(num_use * d + 1),
                 sample_pctg=1, preserve_pctg=preserve_pctg)
            # Only the plain ranking is reversed; the l1/l2/lmax variants
            # are used in the order returned.
            rank_selected = list(rank_selected)[::-1]
            variants = (
                (rank_result, rank_selected),
                (rank_result_l1, rank_selected_l1),
                (rank_result_l2, rank_selected_l2),
                (rank_result_lmax, rank_selected_lmax),
            )
            # performance using different numbers of features
            for k, dimension in enumerate(dimension_list):
                for target, ranking in variants:
                    target[i, j, 0:3, k] = _dist_errors(
                        D0, x, ranking[:dimension])

    np.save(out_dir + '/rank_dist', rank_result)
    np.save(out_dir + '/rank_l1_dist', rank_result_l1)
    np.save(out_dir + '/rank_l2_dist', rank_result_l2)
    np.save(out_dir + '/rank_lmax_dist', rank_result_lmax)

    ######## lap_score ###########
    lap_score_result = np.zeros([7, n_dims])
    lap_score_selected = lap_score.lap_score(x)
    # smallest scores first
    lap_score_selected = list(np.argsort(lap_score_selected)[:num_select])
    for k, dimension in enumerate(dimension_list):
        lap_score_result[0:3, k] = _dist_errors(
            D0, x, lap_score_selected[:dimension])
    np.save(out_dir + '/lap_score_dist', lap_score_result)

    ######## SPEC ###########
    SPEC_result = np.zeros([7, n_dims])
    SPEC_selected = SPEC.spec(x)
    # smallest scores first
    SPEC_selected = list(np.argsort(SPEC_selected)[:num_select])
    for k, dimension in enumerate(dimension_list):
        SPEC_result[0:3, k] = _dist_errors(D0, x, SPEC_selected[:dimension])
    np.save(out_dir + '/SPEC_dist', SPEC_result)

    ####### MCFS parameter: num_clusters ##############
    num_clusters_list = [5, 10, 20, 30]
    MCFS_result = np.zeros([len(num_clusters_list), 7, n_dims])
    for i, num_clusters in enumerate(num_clusters_list):
        MCFS_W = MCFS.mcfs(x, num_select, **{'n_clusters': num_clusters})
        # Score of a feature = largest absolute weight in its row. The loop
        # variable must not be called "x": on Python 2 it would leak out of
        # the comprehension and overwrite the data matrix.
        MCFS_score = [np.max(np.abs(row)) for row in MCFS_W]
        # largest scores first
        MCFS_selected = list(np.argsort(MCFS_score)[-num_select:])[::-1]
        for k, dimension in enumerate(dimension_list):
            MCFS_result[i, 0:3, k] = _dist_errors(
                D0, x, MCFS_selected[:dimension])
    np.save(out_dir + '/MCFS_dist', MCFS_result)

    return (rank_result, rank_result_l1, rank_result_l2, rank_result_lmax,
            lap_score_result, SPEC_result, MCFS_result)
def compare_methods(x, y, num_select, pctg=0.5, sample_pctg=1, num_clusters=5,
                    zero_mean=False, dim=1, t=0.8, thresh=0.1):
    """Run several feature-selection methods on ``x`` and print comparisons.

    Selects ``num_select`` features with random selection, ranking selection
    (plus its l1/l2/lmax variants), Laplacian score, SPEC and MCFS, printing
    the running time of each. It then prints, per method, the KNN accuracy,
    the purity score / NMI, and the 'W' and 'B' diagram distances between
    ``ef.compute_dgm`` of the full data and of the reduced data.

    Args:
        x: 2-D data matrix; normalized (``zero_mean == False``) or
            standardized before use.
        y: labels passed to the ``ef`` evaluation helpers.
        num_select: number of features each method selects.
        pctg: forwarded to ``random_selection`` (``pctg``) and
            ``ranking_selection`` (``preserve_pctg``).
        sample_pctg: not used in the body -- ``ranking_selection`` is called
            with the literal ``sample_pctg=1``.
            NOTE(review): confirm whether the parameter should be forwarded.
        num_clusters: cluster count passed to MCFS.
        zero_mean: chooses ``standardize_feature`` over ``normalize``.
        dim, t, thresh: forwarded to ``ef.compute_dgm``; ``dim`` also to
            ``ef.dgm_distance``.

    Returns:
        None; all results are printed.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter and only
    # fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    # Kept as "== False" (not "not zero_mean") so that values such as None
    # keep selecting the standardize branch exactly as before.
    if zero_mean == False:
        x = normalize(x, axis=0)
    else:
        x = standardize_feature(x)
    _, d = x.shape

    # (A random train/test split used to live here; it is disabled and all
    # evaluations below run on the full data.)

    ########### calculate ######################
    start_time = timer()
    rf_result = random_selection(x, num_select, N=500, num_use=int(0.5 * d),
                                 pctg=pctg, two_sided=False)
    print('rf running time:', timer() - start_time)

    start_time = timer()
    rank_result, l1, l2, lmax = ranking_selection(
        x, num_select, N=500, num_use=int(0.5 * d), sample_pctg=1,
        preserve_pctg=pctg)
    print('rank running time:', timer() - start_time)

    start_time = timer()
    lap_score_result = lap_score.lap_score(x)
    lap_score_result = np.argsort(lap_score_result)[:num_select]  # minimum
    print('lap_score running time:', timer() - start_time)

    start_time = timer()
    SPEC_result = SPEC.spec(x)
    print('SPEC running time:', timer() - start_time)
    SPEC_result = np.argsort(SPEC_result)[:num_select]  # minimum

    # sparse learning based
    start_time = timer()
    MCFS_W = MCFS.mcfs(x, num_select, **{'n_clusters': num_clusters})
    print('MCFS running time:', timer() - start_time)
    # Score of a feature = largest absolute weight in its row. The loop
    # variable must not be called "x": on Python 2 it would leak out of the
    # comprehension and overwrite the data matrix.
    MCFS_result = [np.max(np.abs(row)) for row in MCFS_W]
    MCFS_result = np.argsort(MCFS_result)[-num_select:]  # maximum

    # (NDFS and UDFS comparisons are currently disabled.)

    rf_x = x[:, list(rf_result)]
    rank_x = x[:, list(rank_result)]
    l1_x = x[:, list(l1)]
    l2_x = x[:, list(l2)]
    lmax_x = x[:, list(lmax)]
    lap_score_x = x[:, list(lap_score_result)]
    SPEC_x = x[:, list(SPEC_result)]
    MCFS_x = x[:, list(MCFS_result)]

    print('KNN accuracy')
    for label, selection in (('rf', rf_result),
                             ('rank', rank_result),
                             ('l1', l1),
                             ('l2', l2),
                             ('lmax', lmax),
                             ('lap_score', lap_score_result),
                             ('SPEC', SPEC_result),
                             ('MCFS', MCFS_result)):
        print(label, ef.knn_accuracy(x, y, selection))

    # (The connectivity comparison is currently disabled.)

    print('purity score | NMI')
    print('origin', ef.purity_score(x, y))
    for label, subset in (('rf', rf_x),
                          ('rank', rank_x),
                          ('lap_score', lap_score_x),
                          ('SPEC', SPEC_x),
                          ('MCFS', MCFS_x)):
        print(label, ef.purity_score(subset, y))

    # All diagrams are computed before any distance is printed.
    dgm = ef.compute_dgm(x, t, dim, thresh)
    method_dgms = [
        (label, ef.compute_dgm(subset, t, dim, thresh))
        for label, subset in (('rf', rf_x),
                              ('rank', rank_x),
                              ('l1', l1_x),
                              ('l2', l2_x),
                              ('lmax', lmax_x),
                              ('lap_score', lap_score_x),
                              ('SPEC', SPEC_x),
                              ('MCFS', MCFS_x))
    ]

    print('dgm distance')
    for label, method_dgm in method_dgms:
        print(label, ef.dgm_distance(dgm, method_dgm, 'W', dim), ' ',
              ef.dgm_distance(dgm, method_dgm, 'B', dim))
def compare_methods(x, y, num_select, pctg=0.1, pack_size=1, num_clusters=5,
                    two_sided=False):
    """Compare feature-selection methods on a random 70/30 split of ``x``.

    The rows of ``x`` / ``y`` are shuffled with ``np.random.permutation``;
    the first 70% are used to select ``num_select`` features with Laplacian
    score, random selection, SPEC, cut-SPEC and MCFS (running times are
    printed). Class separability and connectivity are then printed for the
    training part and KNN accuracy for the held-out part.

    Args:
        x: 2-D data matrix.
        y: labels, indexable by the same permutation as the rows of ``x``.
        num_select: number of features each method selects.
        pctg: forwarded to ``random_selection`` and ``ef.connectivity``.
        pack_size: not used in the body.
        num_clusters: cluster count passed to MCFS. It used to be accepted
            but ignored; the default (5) is assumed to match MCFS's own
            default -- TODO confirm against the MCFS module in use.
        two_sided: forwarded to ``random_selection`` and
            ``ef.connectivity``.

    Returns:
        None; all results are printed.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter and only
    # fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    n, d = x.shape
    idx = np.random.permutation(n)
    x, y = x[idx], y[idx]

    ######### split train and test #########
    X = x
    Y = y
    train_num = int(n * 0.7)
    test_num = n - int(n * 0.7)
    x = X[:train_num, :]
    y = Y[:train_num]
    x_test = X[-test_num:, :]
    y_test = Y[-test_num:]

    ########### other methods ######################
    # Similarity based: lap_score, SPEC
    start_time = timer()
    lap_score_result = lap_score.lap_score(x)
    lap_score_result = np.argsort(lap_score_result)[:num_select]
    print('lap_score running time:', timer() - start_time)

    start_time = timer()
    rf_result = random_selection(x, num_select, N=300, num_use=int(d / 2),
                                 pctg=pctg, two_sided=two_sided)
    print('rf running time:', timer() - start_time)

    start_time = timer()
    SPEC_result = SPEC.spec(x)
    print('SPEC running time:', timer() - start_time)
    SPEC_result = np.argsort(SPEC_result)[:num_select]  # find minimum

    start_time = timer()
    CSPEC_result = cut_spec(x, pctg=0.15)
    print('cut-SPEC running time:', timer() - start_time)
    CSPEC_result = np.argsort(CSPEC_result)[:num_select]  # find minimum

    # sparse learning based
    start_time = timer()
    MCFS_W = MCFS.mcfs(x, num_select, **{'n_clusters': num_clusters})
    print('MCFS running time:', timer() - start_time)
    # Score of a feature = largest absolute weight in its row. The loop
    # variable must not be called "x": on Python 2 it would leak out of the
    # comprehension and overwrite the data matrix.
    MCFS_result = [np.max(np.abs(row)) for row in MCFS_W]
    MCFS_result = np.argsort(MCFS_result)[-num_select:]  # find maximum

    # (NDFS, UDFS and the stepwise "prop" selection are currently disabled.)

    rf_x = x[:, list(rf_result)]
    lap_score_x = x[:, list(lap_score_result)]
    SPEC_x = x[:, list(SPEC_result)]
    CSPEC_x = x[:, list(CSPEC_result)]
    MCFS_x = x[:, list(MCFS_result)]

    print('\n')
    print('Class Seperability')
    for label, subset in (('rf', rf_x),
                          ('lap_score', lap_score_x),
                          ('SPEC', SPEC_x),
                          ('cut-SPEC', CSPEC_x),
                          ('MCFS', MCFS_x)):
        print(label, ef.class_seperability(subset, y))

    print('\n')
    print('KNN accuracy')
    # Accuracy is measured on the held-out rows, using the indices selected
    # on the training rows.
    for label, selection in (('rf', rf_result),
                             ('lap_score', lap_score_result),
                             ('SPEC', SPEC_result),
                             ('cut-SPEC', CSPEC_result),
                             ('MCFS', MCFS_result)):
        print(label, ef.knn_accuracy(x_test, y_test, selection))

    print('\n')
    print('connectivity')
    for label, subset in (('rf', rf_x),
                          ('lap_score', lap_score_x),
                          ('SPEC', SPEC_x),
                          ('cut-SPEC', CSPEC_x),
                          ('MCFS', MCFS_x)):
        print(label, ef.connectivity(x, subset, pctg, two_sided))
data = np.loadtxt("./data/GaussianTopologyNode.txt") edge = np.loadtxt("./data/GaussianTopologyEdge.txt") timeStart = datetime.datetime.now() if useEdge: W = ConstructWbyEdge.ConstructWbyEdge(data, edge, t=1) else: kwrags_W = { "metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 3, "t": 1 } W = construct_W(data, **kwrags_W) result = MCFS.mcfs(data, n_selected_features=2, W=W, n_clusters=2) print result timeEnd = datetime.datetime.now() print "Run Time: ", timeEnd - timeStart elif dataSet == 1: data = np.loadtxt("./data/SwissRollTopologyNode.txt") edge = np.loadtxt("./data/SwissRollTopologyEdge.txt") timeStart = datetime.datetime.now() if useEdge: W = ConstructWbyEdge.ConstructWbyEdge(data, edge, t=1) else: kwrags_W = { "metric": "euclidean",
def mcfs(train_set, test_set, features_number, clusters_number):
    """Select features with MCFS, drop the excluded ones, and run k-means.

    Column 0 of both sets is treated as the label column and the remaining
    columns as features. The train features are ranked with MCFS (binary kNN
    affinity, k=3) and the first ``features_number`` are kept. Feature names
    given on the command line (``sys.argv[3:]``) are then removed; if the
    number removed does not equal the number requested, an error is logged
    and nothing else happens. Otherwise ``run_kmeans`` is called on the
    reduced train/test matrices.
    """
    # Features to delete come from the command line, 4th argument onwards
    features_to_delete = list(sys.argv[3:])

    # Feature columns of both sets and known labels of the test set
    train_features = train_set.iloc[:, 1:]
    test_features = test_set.iloc[:, 1:]
    known_labels_test = test_set.iloc[:, 0]

    # Building matrix W for the MCFS algorithm
    affinity = construct_W.construct_W(
        train_features.values,
        metric='euclidean',
        neighbor_mode='knn',
        weight_mode='binary',
        k=3,
    )

    # MCFS gives a weight to each feature; rank them and keep the first ones
    weighted_features = MCFS.mcfs(train_features.values,
                                  features_number,
                                  W=affinity,
                                  n_clusters=clusters_number)
    ranking = MCFS.feature_ranking(weighted_features)
    selected = ranking[:features_number]

    # Names of the selected features, minus the ones marked for deletion
    selected_names = [train_features.columns[position] for position in selected]
    kept_names = [
        name for name in selected_names if name not in features_to_delete
    ]

    # Every feature "to delete" must have matched a selected feature
    if len(kept_names) != len(selected) - len(features_to_delete):
        kmeans_rfd_logger.error(
            'One or more feature "to delete" is/are not correct.')
        return

    # Keep only the surviving features on the train and test sets
    selected_features_train = train_features.loc[:, kept_names]
    selected_features_test = test_features.loc[:, kept_names]

    kmeans_rfd_logger.info(
        '(Deleted features: {0})'.format(features_to_delete))

    # Running k-means according to selected features
    run_kmeans(len(kept_names), selected_features_train.values,
               selected_features_test.values, clusters_number,
               known_labels_test)