def selectFeatureLapScore(filename, num_feature, num_cluster):
    """Select `num_feature` features via Laplacian Score and evaluate with k-means.

    Loads the pre-extracted TSFresh feature pickles for dataset `filename`,
    ranks the features with the (unsupervised) Laplacian Score, keeps the top
    `num_feature` of them, and runs the clustering evaluation with
    `num_cluster` clusters against the known test labels.
    """
    # Load the pickles saved on disk holding the samples and ALL the features
    # extracted by TSFresh. This is the data we work on.
    all_features_train = pd.read_pickle(
        "./pickle/feature_complete/TRAIN/{0}_TRAIN_FeatureComplete.pkl".format(
            filename))
    all_features_test = pd.read_pickle(
        "./pickle/feature_complete/TEST/{0}_TEST_FeatureComplete.pkl".format(
            filename))
    # Drop columns containing NaN values
    all_features_train = all_features_train.dropna(axis=1)
    all_features_test = all_features_test.dropna(axis=1)
    # Build the affinity matrix W required by the Laplacian Score
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(all_features_train.values, **kwargs_W)
    # Run the Laplacian Score algorithm: one score per feature
    # (smaller score = more relevant feature).
    featurePesate = lap_score.lap_score(all_features_train.values, W=W)
    # Rank the features best-first according to the Laplacian Score criterion
    idx = lap_score.feature_ranking(featurePesate)
    idxSelected = idx[0: num_feature]  # keep the requested number of features
    # Extract the names of the selected features
    nomiFeatureSelezionate = []
    for i in idxSelected:
        nomiFeatureSelezionate.append(all_features_train.columns[i])
    # Build the dataframe containing only the selected features
    dataframeFeatureSelezionate = all_features_train.loc[:, nomiFeatureSelezionate]
    # Restrict the test set to the selected features as well
    all_features_test = all_features_test.loc[:, nomiFeatureSelezionate]
    # Extract the known (ground-truth) class labels
    labelConosciute = estrattoreClassiConosciute.estraiLabelConosciute(
        "./UCRArchive_2018/{0}/{0}_TEST.tsv".format(filename))
    # K-means on the selected features
    print("\nRisultati con feature selezionate da noi con Lap Score")
    print("Numero feature: {0}".format(all_features_test.shape[1]))
    testFeatureSelection(X_selected=dataframeFeatureSelezionate.values,
                         X_test=all_features_test.values,
                         num_clusters=num_cluster,
                         y=labelConosciute)
def SKF_lap(X, y):
    """Rank the features of X best-first by Laplacian Score.

    `y` is accepted for signature compatibility with the other selectors
    but is not used: the Laplacian Score is unsupervised.
    """
    # Affinity matrix: kNN graph (k=5) with heat-kernel weights (t=1).
    graph_params = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1,
    }
    affinity = construct_W(X, **graph_params)
    # One score per feature; smaller score = more relevant feature.
    scores = lap_score.lap_score(X, W=affinity)
    # Ascending sort of the scores gives the best-first index order.
    return lap_score.feature_ranking(scores)
def get_lap_score(data, k=5, t=1, top_feature=30):
    """Score the columns of `data` with the Laplacian Score.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples in rows, features in columns.
    k : int
        Number of neighbours for the kNN affinity graph.
    t : float
        Heat-kernel bandwidth used for the graph weights.
    top_feature : int
        Number of top-ranked features to return in the result table.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        A ('Feature', 'Score') table with the `top_feature` best features,
        and the full best-first index ranking of all features.
    """
    # kNN/heat-kernel affinity graph required by the Laplacian Score.
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                "weight_mode": "heat_kernel", "k": k, 't': t}
    W = construct_W.construct_W(data, **kwargs_W)
    score = lap_score.lap_score(data, W=W)
    # Best-first feature indices (feature_ranking sorts scores ascending).
    ranking = lap_score.feature_ranking(score)
    # Pair each column name with its score.
    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Feature', 'Score']
    # BUG FIX: for the Laplacian Score a SMALLER value means a more relevant
    # feature, so the best features are the ones with the smallest scores.
    # The original used nlargest, which returned the WORST features and
    # contradicted the `ranking` returned alongside.
    result = featureScores.nsmallest(top_feature, 'Score')
    return result, ranking
def laplacian_score(X, y=None, **kwargs):
    """Return feature indices of X ranked best-first by Laplacian Score.

    `y` and any extra keyword arguments are accepted for API compatibility
    but are ignored: the Laplacian Score is fully unsupervised.
    """
    # kNN affinity graph (k=5) weighted with a heat kernel (t=1).
    graph_opts = dict(metric="euclidean",
                      neighbor_mode="knn",
                      weight_mode="heat_kernel",
                      k=5,
                      t=1)
    W = construct_W.construct_W(X, **graph_opts)
    # One score per feature; smaller score = more relevant feature.
    feature_scores = lap_score.lap_score(X, W=W)
    # Ascending sort of the scores yields the best-first index order.
    return lap_score.feature_ranking(feature_scores)
def main():
    """Demo: Laplacian-Score feature selection + k-means evaluation on COIL20."""
    # Load the COIL20 dataset: samples X and ground-truth labels y.
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)
    y = mat['Y'][:, 0]

    # Build the kNN/heat-kernel affinity matrix used by the Laplacian Score.
    affinity_args = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1,
    }
    W = construct_w.construct_w(X, **affinity_args)

    # Per-feature scores (smaller = better), then indices sorted best-first.
    score = lap_score.lap_score(X, W=W)
    idx = lap_score.feature_ranking(score)

    num_fea = 100     # number of selected features
    num_cluster = 20  # usually set to the number of ground-truth classes

    # Keep only the selected columns.
    selected_features = X[:, idx[0:num_fea]]

    # k-means on the selected features, repeated 20 times; average the metrics.
    nmi_total = 0
    acc_total = 0
    for _ in range(20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # Report the average NMI and average ACC over the 20 runs.
    print('NMI:', old_div(float(nmi_total), 20))
    print('ACC:', old_div(float(acc_total), 20))
def lap_score_filtering(self, vt_data, num_features):
    """Keep the `num_features` columns of `vt_data` ranked best by Laplacian Score.

    :param vt_data: dataframe with samples in rows and features in columns
    :param num_features: number of top-ranked columns to retain
    :return: a copy of `vt_data` restricted to the selected columns
    """
    data_matrix = vt_data.to_numpy()
    # Cosine kNN affinity graph; the parameters are echoed to the user.
    kwargs_W = {
        "metric": "cosine",
        "neighbor_mode": "knn",
        "weight_mode": "cosine",
        "k": 40,
        't': 500
    }
    print("We perform Laplacian score filtering using the following parameters: "
          + str(kwargs_W))
    affinity = construct_W.construct_W(data_matrix, **kwargs_W)
    # Per-feature scores (smaller = better), then best-first column indices.
    scores = lap_score.lap_score(data_matrix, W=affinity)
    ranking = lap_score.feature_ranking(scores)
    # Select the top-ranked columns from the original dataframe.
    filtered_data = vt_data.iloc[:, ranking[0:num_features]].copy()
    print("\nThe data now has " + str(len(filtered_data.T))
          + " features after Laplacian score filtering.")
    return filtered_data
def main():
    """Demo: Laplacian-Score feature selection + k-means evaluation on COIL20.

    Loads the COIL20 dataset, ranks all features with the unsupervised
    Laplacian Score, keeps the best 100, and reports the average NMI/ACC of
    20 k-means runs on the reduced data.
    """
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]  # data
    X = X.astype(float)
    y = mat["Y"]  # label
    y = y[:, 0]

    # construct affinity matrix (kNN graph with heat-kernel weights)
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                "weight_mode": "heat_kernel", "k": 5, "t": 1}
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features (smaller score = more relevant feature)
    score = lap_score.lap_score(X, W=W)

    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)

    # perform evaluation on clustering task
    num_fea = 100     # number of selected features
    num_cluster = 20  # usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # FIX: the original used Python 2 print statements, which are a syntax
    # error under Python 3; use the print() function instead.
    print("NMI:", float(nmi_total) / 20)
    print("ACC:", float(acc_total) / 20)
def predict(self, X):
    """
    Select the `self.n_band` best bands of X by Laplacian Score.

    :param X: shape [n_row*n_clm, n_band] — samples in rows, bands in columns
    :return: X restricted to the selected bands
    """
    # If a 3-D cube is ever passed, it must be flattened first:
    # n_row, n_column, __n_band = X.shape
    # XX = X.reshape((n_row * n_column, -1))  # n_sample * n_band
    XX = X
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(XX, **kwargs_W)
    # obtain the scores of features
    # FIX: scoring and selection must use the same (flattened) matrix XX that
    # built W — the original mixed X and XX, which is harmless while XX = X
    # but would break as soon as the reshape above is re-enabled.
    score = lap_score.lap_score(XX, W=W)
    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)
    # obtain the dataset on the selected features
    selected_features = XX[:, idx[0:self.n_band]]
    # selected_features.reshape((self.n_band, n_row, n_column))
    # selected_features = np.transpose(selected_features, axes=(1, 2, 0))
    return selected_features
# --- One cross-validation fold: compare feature-selection methods ----------
# NOTE(review): fragment of a larger loop — `labels`, `train_index`,
# `test_index`, `X_train`, `X_test`, `clf` and `num_features` are defined by
# the surrounding (not visible) code.
y_train, y_test = labels[train_index], labels[test_index]
start_time = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
acc = []  # per-method accuracies, appended in a fixed order

# lap_score: unsupervised ranking via a kNN/heat-kernel affinity graph
method = 'lap_score'
kwargs_W = {
    "metric": "euclidean",
    "neighbor_mode": "knn",
    "weight_mode": "heat_kernel",
    "k": 5,
    't': 1
}
W = construct_W.construct_W(X_train, **kwargs_W)
score = lap_score.lap_score(X_train, W=W)
idx = lap_score.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# fisher_score: supervised ranking
score = fisher_score.fisher_score(X_train, y_train)
idx = fisher_score.feature_ranking(score)
selected_fea_train = X_train[:, idx[0:num_features]]
selected_fea_test = X_test[:, idx[0:num_features]]
clf.fit(selected_fea_train, y_train)
acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

# reliefF: supervised, instance-based feature weighting (continues past this view)
score = reliefF.reliefF(X_train, y_train)
def bench(self, X, X_norm, y, n=2):
    """Benchmark several feature-selection methods on the same data.

    For each method this records: the method name, the selected feature
    names, the wall-clock selection time, whether the method is supervised,
    and the downstream evaluation score from `self.train_real_data`.

    :param X: original (unnormalised) dataframe used for the final evaluation
    :param X_norm: normalised dataframe the selectors actually run on
    :param y: target labels (used by the supervised selectors only)
    :param n: number of clusters passed to UDFS
    :return: dict of parallel lists, one entry per method
    """
    num_feats = 20  # every selector keeps (at most) this many features
    output_data = {'method': list(), 'features': list(), 'time': list(),
                   self.test_att: list(), 'supervised': list()}
    # ----------------------------------------------------------------
    # CFS (disabled)
    # start = time.perf_counter()
    # idx = cfs(X_norm.to_numpy(), y.to_numpy())[0]
    # print(idx)
    # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    # output_data['method'].append('CFS')
    # output_data['time'].append(time.perf_counter() - start)
    # output_data['features'].append(selected_features)
    # output_data[self.test_att].append(self.train_real_data(selected_features, X))

    # LA: Laplacian Score (unsupervised; kNN/heat-kernel affinity graph)
    start = time.perf_counter()
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X_norm.to_numpy(), **kwargs_W)
    score = lap_score.lap_score(X_norm.to_numpy(), W=W)
    idx = lap_score.feature_ranking(score)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('Laplacian Score')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # FCBF: Feature correlation based filter (disabled)
    # start = time.perf_counter()
    # idx = fcbf(X_norm.to_numpy(), y.to_numpy(), n_selected_features=num_feats)[0]
    # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    # output_data['method'].append('FCBF')
    # output_data['time'].append(time.perf_counter() - start)
    # output_data['features'].append(selected_features)
    # output_data['supervised'].append(True)
    # output_data[self.test_att].append(self.train_real_data(selected_features, X))
    # print(output_data)
    # output_data['method'].append('FCBF')
    # output_data['time'].append(9999999)
    # output_data['features'].append([])
    # output_data['supervised'].append(True)
    # output_data[self.test_att].append(0.0)

    # UDFS: Unsupervised Discriminative Feature Selection
    start = time.perf_counter()
    Weight = udfs(X_norm.to_numpy(), gamma=0.1, n_clusters=n)
    idx = feature_ranking(Weight)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('UDFS')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # SPEC: Spectral Feature Selection
    start = time.perf_counter()
    score = spec(X_norm.to_numpy())
    idx = feature_ranking_spec(score)
    selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
    output_data['method'].append('SPEC')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(selected_features)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(selected_features, X))
    print(output_data)

    # MRMR (MIQ): minimum redundancy / maximum relevance, quotient scheme
    start = time.perf_counter()
    mrmr = pymrmr.mRMR(X_norm, 'MIQ', num_feats)
    output_data['method'].append('MRMR(MIQ)')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(mrmr)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(mrmr, X))
    print(output_data)

    # MRMR (MID): minimum redundancy / maximum relevance, difference scheme
    start = time.perf_counter()
    mrmr = pymrmr.mRMR(X_norm, 'MID', num_feats)
    output_data['method'].append('MRMR(MID)')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(mrmr)
    output_data['supervised'].append(False)
    output_data[self.test_att].append(self.train_real_data(mrmr, X))
    print(output_data)

    # RFE: recursive feature elimination (supervised)
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    rfe_selector = RFE(estimator=LogisticRegression(),
                       n_features_to_select=num_feats, step=10, verbose=5)
    start = time.perf_counter()
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X_norm.loc[:, rfe_support].columns.tolist()
    output_data['method'].append('RFE')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(rfe_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(rfe_feature, X))
    print(output_data)

    # ----------------------------------------------------------------
    # Lasso: L1-penalised logistic regression via SelectFromModel
    # NOTE(review): penalty="l1" requires an l1-capable solver (e.g.
    # 'liblinear' or 'saga') on recent scikit-learn versions — confirm.
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"),
                                          max_features=num_feats)
    start = time.perf_counter()
    embeded_lr_selector.fit(X_norm, y)
    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X_norm.loc[:, embeded_lr_support].columns.tolist()
    output_data['method'].append('Lasso')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(embeded_lr_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(embeded_lr_feature, X))
    print(output_data)
    print(str(len(embeded_lr_feature)), 'selected features')

    # -----------------------------------------------------------------------------
    # Tree-based: random-forest feature importances via SelectFromModel
    from sklearn.feature_selection import SelectFromModel
    from sklearn.ensemble import RandomForestClassifier
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100),
                                          max_features=num_feats)
    start = time.perf_counter()
    embeded_rf_selector.fit(X_norm, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X_norm.loc[:, embeded_rf_support].columns.tolist()
    output_data['method'].append('Tree_Based_RF')
    output_data['time'].append(time.perf_counter() - start)
    output_data['features'].append(embeded_rf_feature)
    output_data['supervised'].append(True)
    output_data[self.test_att].append(self.train_real_data(embeded_rf_feature, X))
    print(output_data)
    print(str(len(embeded_rf_feature)), 'selected features')

    # -------------------------------------------------------------------------------
    # Also tree-based: LightGBM importances via SelectFromModel
    from sklearn.feature_selection import SelectFromModel
    from lightgbm import LGBMClassifier
    lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32,
                          colsample_bytree=0.2, reg_alpha=3, reg_lambda=1,
                          min_split_gain=0.01, min_child_weight=40)
    embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
    start = time.perf_counter()
    embeded_lgb_selector.fit(X_norm, y)
    embeded_lgb_support = embeded_lgb_selector.get_support()
    embeded_lgb_feature = X_norm.loc[:, embeded_lgb_support].columns.tolist()
    output_data['method'].append('Tree_Based_lightGBM')
    output_data['time'].append(time.perf_counter() - start)
    # NOTE(review): 'supervised' is appended before 'features' here, unlike the
    # other sections — order within the dict's lists is unaffected, so kept as-is.
    output_data['supervised'].append(True)
    output_data['features'].append(embeded_lgb_feature)
    output_data[self.test_att].append(self.train_real_data(embeded_lgb_feature, X))
    print(output_data)
    print(str(len(embeded_lgb_feature)), 'selected features')
    return output_data
########################### Apply Feature Selection methods :ReliefF, Laplacian score & Fisher #ReliefF score_rel = reliefF.reliefF(X_train, y_train) idx_rel = reliefF.feature_ranking(score_rel) #Laplacian score kwargs_W = { "metric": "euclidean", "neighbor_mode": "knn", "k": 7, 't': 1, 'reliefF': True } W = construct_W.construct_W(X_train, **kwargs_W) score_lap = lap_score.lap_score(X_train, W=W) idx_lap = lap_score.feature_ranking(score_lap) #Fisher score_fish = fisher_score.fisher_score(X_train, y_train) print(score_fish) idx_fish = fisher_score.feature_ranking(score_fish) ###################################### Feature Integration idxM = idx_rel[:threshold] idxN = idx_lap[:threshold] idxO = idx_fish[:threshold] if combination_method == 1: #AND idx_and = reduce(np.intersect1d, (idxO, idxM, idxN)) idx = idx_and print("number of selectes features (bins) = ", idx.shape[0])