def get_feature_scaling(graphs, targets, decomposition_funcs=None, preprocessors=None, nbits=11, threshold=0.25):
    """Estimate per-feature importances and signs from a linear model.

    The graphs are vectorized, an elastic-net ``SGDClassifier`` is wrapped in
    ``RFECV`` (10% of the features dropped per step, 3-fold CV) and fitted
    against ``targets``. The coefficients of the final estimator are mapped
    back to the full feature space, where eliminated features get 0.

    Args:
        graphs: Graphs passed to ``vectorize``.
        targets: Class labels, one per graph.
        decomposition_funcs: Forwarded to ``vectorize``.
        preprocessors: Forwarded to ``vectorize``.
        nbits: Forwarded to ``vectorize``.
        threshold: Fraction in [0, 1]; it is multiplied by 100 and used as
            the percentile of the normalised importances below which a
            feature is zeroed out.

    Returns:
        A tuple ``(importances, signs)`` of flat arrays. ``importances``
        holds absolute coefficients scaled by their maximum, and ``signs``
        holds the coefficient signs (-1, 0 or 1). Entries whose importance
        falls below the percentile cut-off are 0 in both arrays.
    """
    x = vectorize(graphs, decomposition_funcs, preprocessors=preprocessors, nbits=nbits, seed=1)
    estimator = SGDClassifier(penalty='elasticnet', tol=1e-3)
    fs = RFECV(estimator, step=.1, cv=3)
    fs.fit(x, targets)

    # inverse_transform pads the eliminated features with zeros.
    # NOTE(review): with more than two classes coef_ has one row per class and
    # reshape(-1) concatenates them -- confirm that targets are binary.
    importances = fs.inverse_transform(fs.estimator_.coef_).reshape(-1)
    signs = np.sign(importances)
    importances = np.absolute(importances)

    # Scale to [0, 1]; skip the division when every coefficient is zero so the
    # result is all zeros rather than NaN.
    max_importance = np.max(importances)
    if max_importance > 0:
        importances = importances / max_importance

    # Non-linear thresholding to remove the least important features.
    th = np.percentile(importances, threshold * 100)
    below = importances < th
    signs[below] = 0
    importances[below] = 0
    return importances, signs
def feature_importance(self, pos_graphs, neg_graphs):
    """Fit a linear classifier on the two graph sets and return its weights.

    Positive graphs are labelled 1 and negative graphs -1. The graphs are
    vectorized with ``self.encoding_func`` / ``self.feature_size``, an
    elastic-net ``SGDClassifier`` is wrapped in ``RFECV`` (10% of the
    features dropped per step, 3-fold CV) and fitted. The fitted final
    estimator is stored on ``self.estimator``.

    Args:
        pos_graphs: Graphs of the positive class; concatenated to
            ``neg_graphs`` with ``+``.
        neg_graphs: Graphs of the negative class.

    Returns:
        A tuple ``(importance_dict, intercept)``. ``importance_dict`` maps
        each feature index of the full feature space to its coefficient
        (0 for features eliminated by RFECV), and ``intercept`` is the
        first entry of the estimator's intercept.
    """
    graphs = pos_graphs + neg_graphs
    y = [1] * len(pos_graphs) + [-1] * len(neg_graphs)
    x = vectorize_graphs(graphs, encoding_func=self.encoding_func, feature_size=self.feature_size)

    estimator = SGDClassifier(penalty='elasticnet', tol=1e-3)
    fs = RFECV(estimator, step=.1, cv=3)
    fs.fit(x, y)
    self.estimator = fs.estimator_

    # Map the coefficients of the selected features back to the original
    # feature indices; inverse_transform fills eliminated features with zeros.
    importances = fs.inverse_transform(fs.estimator_.coef_).reshape(-1)
    intercept = fs.estimator_.intercept_[0]
    importance_dict = dict(enumerate(importances))
    return importance_dict, intercept
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
    """Repeatedly run RFECV over stratified folds, shrinking the feature set.

    ``X`` is standardized, then for each of ``self.outer_loop`` repetitions
    the data is split into 5 shuffled stratified folds. Within each fold an
    ``RFECV`` around ``self.clf`` is fitted ``self.inner_loop`` times, each
    time on the features kept by the previous fit. The full matrix is reduced
    with the same selector, so later folds start from the reduced feature set.

    Args:
        X: Feature matrix. It is indexed with the fold index arrays after
            scaling.
        y: Labels, indexed with the fold index arrays (so presumably a
            NumPy array rather than a list -- TODO confirm).
        sample_weight: Not used by this method.
        check_input: Not used by this method.
        X_idx_sorted: Not used by this method.

    Returns:
        ``self``.

    Side effects:
        Sets ``self.X_minmax``, ``self.scores``, ``self.list``,
        ``self.ranking`` and ``self.X_transformed``; appends to
        ``self.loop_indices`` and ``self.models``; prints progress.

    NOTE(review): ``list_temp_prev``, ``ranking`` and the transformed
    matrices are only assigned inside the loops, so ``self.outer_loop`` and
    ``self.inner_loop`` must both be at least 1 or the final assignments
    raise ``NameError``.
    """
    # Store the original feature list and normalize the data.
    # Despite the "minmax" name, StandardScaler is what is applied here.
    list_temp = self.feature_list
    scaler = StandardScaler()
    X_minmax = scaler.fit_transform(X)
    self.X_minmax = copy.deepcopy(X_minmax)
    self.scores = []
    # Splitter used both for the outer folds and as the CV inside RFECV.
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    for outer in range(self.outer_loop):
        print("\n--------This is outer loop {}---------\n".format(outer + 1))
        # Run the outer loop from here.
        for i, (train_o, test_o) in enumerate(kfold.split(X_minmax, y)):
            self.loop_indices.append((train_o, test_o))
            print("This is set {}".format(i + 1))
            X_train_o = X_minmax[train_o]
            y_train_o = y[train_o]
            X_test_o = X_minmax[test_o]
            y_test_o = y[test_o]
            X_train_transformed = copy.deepcopy(X_train_o)
            X_test_transformed = copy.deepcopy(X_test_o)
            # Run the inner loop from here.
            for inner in range(self.inner_loop):
                # Lower bound on the features RFECV may keep: 10 fewer than
                # currently present, capped at 100 ...
                n_feat = min(100, X_train_transformed.shape[1] - 10)
                # ... and never below 10.
                # NOTE(review): the original comment said "at least 5", but
                # the code enforces 10.
                n_feat = max(10, n_feat)
                list_temp_prev = list_temp
                print("\n\t--------This is inner loop {}---------\n".format(inner + 1))
                rfecv = RFECV(estimator=self.clf, step=1, min_features_to_select=n_feat, cv=kfold, scoring='accuracy')
                # Fit the selector on the training fold and keep only the
                # selected columns, so the next pass starts from the reduced set.
                X_train_transformed = rfecv.fit_transform(X_train_transformed, y_train_o)
                self.models.append(rfecv)
                X_test_transformed = rfecv.transform(X_test_transformed)
                # The full matrix is reduced as well; this rebinds the array
                # that later folds slice their train/test rows from.
                X_minmax = rfecv.transform(X_minmax)
                features = rfecv.n_features_
                print("\tShape of transformed train dataset is: {}".format(X_train_transformed.shape))
                print("\tOptimal no. of features are: {}".format(features))
                ranking = rfecv.ranking_
                # Update the feature list here.
                list_temp = self.updateFeatures(list_temp_prev, ranking)
            # Check the score once the inner loop has finished. The test data
            # is already reduced, so it is mapped back to the selector's input
            # width with inverse_transform before scoring.
            X_temp = rfecv.inverse_transform(X_test_transformed)
            score = rfecv.score(X_temp, y_test_o)
            self.scores.append(score)
            print("Shape of transformed train dataset is: {}".format(X_train_transformed.shape))
            print("Shape of ranks is: {}\n\n".format(ranking.shape))
        # Print the average score after finishing the outer loop.
        print("After outer loop CV, mean score is: {}".format(mean(self.scores)))
    # NOTE(review): this stores the feature list from *before* the last
    # updateFeatures call, not the final ``list_temp`` -- confirm intended.
    self.list = list_temp_prev
    self.ranking = ranking
    print(X_train_transformed.shape)
    print(X_test_transformed.shape)
    # Train rows come first and test rows second, from the last fold processed.
    self.X_transformed = np.vstack((X_train_transformed, X_test_transformed))
    return self
f"acc_train = {acc_train:.3f}; f1score_train = {f1score_train}\nacc_validation = {acc_validation:.8f}; f1score_validaton = {f1score_validation}" ) #%% ============================最终的测试============================ # 最好使用外部测试集 pred_test_label = model.predict(feature_test_) pred_test_prob = model.decision_function(feature_test_) acc_test = metrics.accuracy_score(label_test, pred_test_label) f1score_test = metrics.f1_score(label_test, pred_test_label) print(f"acc_test = {acc_test:.8f}; f1score_test = {f1score_test}\n") #%% ============================结果可视化============================ # 获取权重 wei = model.coef_ wei = (wei - wei.mean()) / wei.std() wei = selector.inverse_transform(wei) wei = pca.inverse_transform(wei) weight = np.zeros(mask.shape) weight[mask] = wei[0] weight = weight + weight.T # 只显示前0.2%的权重 threshold = 99.8 topperc = np.percentile(np.abs(weight), threshold) weight[np.abs(weight) < topperc] = 0 # 获取MNI坐标 coords_file = r"F:\workshop\demo_data\BNA_subregions.xlsx" coords_info = pd.read_excel(coords_file) # 获取MNI坐标