# Imports required by this script (DataDeal, Data_Numeric, FSVM and LS_FSVM
# are local modules from this repository).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import DataDeal
import Data_Numeric
import FSVM
import LS_FSVM


def Attribut_rank(model):
    data = pd.read_csv('german_credit.csv')
    X = data.drop(['default'], axis=1)
    label = data['default']
    df = Data_Numeric.Data_numerique(X)
    data = DataDeal.get_data(df, label)
    Train_data, test = train_test_split(data, test_size=0.2, random_state=42)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    if model == 'LSFSVM':
        kernel_dict = {'type': 'RBF', 'sigma': 0.717}
        fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
        clf = LS_FSVM.LSFSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
        m = clf._mvalue(x_train, y_train)
    elif model == 'FSVM':
        kernel_dict = {'type': 'RBF', 'sigma': 0.717}
        fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
        clf = FSVM.FSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
        m = clf._mvalue(x_train, y_train)
    elif model == 'SVM':
        clf = svm.SVC()

    # Baseline AUC with the complete attribute set.
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    auc_complete = roc_auc_score(y_test, y_pred)

    AUC = []
    for col in X.columns:
        # Variant 1 (disabled): delete one attribute at a time.
        # X_r = X.drop([col], axis=1)
        # df = Data_Numeric.Data_numerique(X_r)
        # data = DataDeal.get_data(df, label)

        # Variant 2 (active): keep only one attribute.
        X_r = pd.DataFrame(X[col])
        label[label == 0] = -1
        df = Data_Numeric.Data_numerique(X_r)
        X_r = np.array(df)
        min_max_scaler = preprocessing.MinMaxScaler()
        X_r = min_max_scaler.fit_transform(X_r)
        data = np.append(X_r, np.array(label)[:, None], axis=1)

        # Re-split on the single-attribute data (this line was commented out
        # in the original, which left the loop training on the stale arrays
        # from the full-feature split above).
        Train_data, test = train_test_split(data, test_size=0.2, random_state=42)
        x_test = test[:, :-1]
        y_test = test[:, -1]
        x_train = Train_data[:, :-1]
        y_train = Train_data[:, -1]

        if model == 'LSFSVM':
            kernel_dict = {'type': 'RBF', 'sigma': 0.717}
            fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
            clf = LS_FSVM.LSFSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
            m = clf._mvalue(x_train, y_train)
        elif model == 'FSVM':
            kernel_dict = {'type': 'RBF', 'sigma': 0.717}
            fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
            clf = FSVM.FSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
            m = clf._mvalue(x_train, y_train)
        elif model == 'SVM':
            clf = svm.SVC()

        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        auc = roc_auc_score(y_test, y_pred)
        AUC.append(auc)

    # Rank attributes by their single-attribute AUC.
    indices = np.argsort(AUC)[::-1]
    featurerank = [X.columns[i] for i in indices]

    print('AUC complete', auc_complete)
    plt.figure(figsize=(10, 8))
    feature_imp = pd.Series(AUC, index=X.columns).sort_values(ascending=False)
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlim((0.4, 0.7))
    plt.xlabel('Feature Importance Score (AUC)')
    plt.ylabel('Features')
    plt.title('Visualizing Important Features for SVM')
    plt.show()
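# A minimal way to exercise the ranking above (hypothetical driver block, not
# part of the original file; assumes german_credit.csv and the local modules
# are available on the path):
if __name__ == '__main__':
    for model_name in ('SVM', 'FSVM', 'LSFSVM'):
        Attribut_rank(model_name)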
            if judgment == 'Acc':
                score = len(y_test[y_predict == y_test]) / test_length
            elif judgment == 'AUC':
                score = roc_auc_score(y_test, y_predict)
            score_memory.append(score)
            if score > best_score:
                best_score = score
                best_parameter = [C, d]
    print('kernel_dict:', kernel_dict_type)
    print('best_parameter', best_parameter)
    return best_parameter


if __name__ == '__main__':
    x_train, y_train, x_test, y_test = DataDeal.get_data()
    fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
    # Search ranges for the penalty C and the RBF width sigma.
    param_grid = {
        'C': np.logspace(0, 1, 50),
        'sigma': np.logspace(-2, 0.5, 50)
    }
    C = LS_FSVM_best(x_train, y_train, 'LINEAR', param_grid, 'AUC',
                     fuzzyvalue, 3 / 4, 1)
    kernel_dict = {'type': 'LINEAR'}
    clf = LS_FSVM.LSFSVM(C, kernel_dict, fuzzyvalue, 3 / 4)
    clf._mvalue(x_train, y_train)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
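# For reference, the 'judgment' switch in the search loop above can be read
# as a small scoring helper. This is a sketch, not the repository's code;
# the function name is hypothetical, and it assumes y_test / y_predict are
# aligned 1-D label arrays.
import numpy as np
from sklearn.metrics import roc_auc_score

def score_predictions(y_test, y_predict, judgment='AUC'):
    if judgment == 'Acc':
        # Fraction of exact label matches, equivalent to
        # len(y_test[y_predict == y_test]) / len(y_test).
        return float(np.mean(np.asarray(y_predict) == np.asarray(y_test)))
    elif judgment == 'AUC':
        return roc_auc_score(y_test, y_predict)
    raise ValueError("judgment must be 'Acc' or 'AUC'")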
            y_prob[i] = round(y_prob[i], 3)
        return y_prob

    def decision_function(self, X):
        return self.y_predict


# Test Code for _LSSVMtrain
if __name__ == '__main__':
    data = DataDeal.get_data('german_numerical.csv')
    Train_data, test = train_test_split(data, test_size=0.2)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    kernel_dict = {'type': 'RBF', 'sigma': 0.717}
    fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}

    clf = FSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
    m = clf._mvalue(x_train, y_train)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
            y_prob[i] = round(y_prob[i], 3)
        return y_prob

    def decision_function(self, X):
        return self.y_predict


# Test Code for _LSSVMtrain
if __name__ == '__main__':
    data = pd.read_csv('DF4.csv')
    X = data.drop(['default'], axis=1)
    label = data['default']
    data = DataDeal.get_data(X, label, 'normaliser', scaler='True')
    x = data[:, :-1]
    y = data[:, -1]
    Train_data, test = train_test_split(data, test_size=0.2, random_state=42)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    # Alternative: stratified resampling instead of a single split.
    # ss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, train_size=0.8, random_state=0)
    # for train_index, test_index in ss.split(x, y):
    #     x_train, x_test = x[train_index, :], x[test_index, :]  # feature rows of each split
    #     y_train, y_test = y[train_index], y[test_index]        # matching class labels
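# The commented-out stratified split preserves the class ratio in every fold.
# A self-contained toy illustration (invented arrays, not the repo's data):
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X_toy = np.arange(24).reshape(12, 2)
y_toy = np.array([1] * 8 + [0] * 4)   # imbalanced labels, ratio 2:1
ss_toy = StratifiedShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_index, test_index in ss_toy.split(X_toy, y_toy):
    print(y_toy[test_index])  # each 3-sample test fold keeps the 2:1 ratio, e.g. [1 0 1]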
def IV_plot(model):
    data = pd.read_csv('german_credit.csv')
    X = data.drop(['default'], axis=1)

    if model == 'Origine':
        # Use the true labels, remapped so that 0 = bad, 1 = good.
        Y = data['default'].copy()
        Y = Y - 1
        Y[Y == -1] = 1
    else:
        # Use the labels predicted by a previously trained classifier; the
        # four model variants differ only in the pickle they load.
        pickle_paths = {
            'FSVM': 'save/FSVM_Cen_Lin_RBF_Origine.pickle',
            'LSFSVM': 'save/LSFSVM_Cen_Lin_RBF_Origine.pickle',
            'LSFSVM_bagging': 'save/LSFSVMbag_Cen_Lin_RBF_Origine.pickle',
            'FSVM_bagging': 'save/FSVMbag_Cen_Lin_RBF_Origine.pickle',
        }
        label = data['default']
        df = Data_Numeric.Data_numerique(X)
        data = DataDeal.get_data(df, label)
        x = data[:, :-1]
        with open(pickle_paths[model], 'rb') as f:
            clf = pickle.load(f)
        y_pred = clf.predict(x)
        y = y_pred.copy()
        y[y == -1] = 0
        y = y.astype('int64')
        Y = pd.Series(y, name='Yp')

    badnum = len(Y[Y == 0])       # number of bad clients
    goodnum = Y.count() - badnum  # number of good clients

    def self_bin_object(X):
        # Bin a categorical attribute by its values, then compute the WOE of
        # each bin and the attribute's Information Value (IV).
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": X})
        d2 = d1.groupby('Bucket', as_index=True)  # aggregate per bin
        d3 = pd.DataFrame(d2.count(), columns=['good'])
        d3['good'] = d2.sum().Y
        d3['total'] = d2.count().Y
        d3['bad'] = d3['total'] - d3['good']
        d3['rate'] = d2.mean().Y
        d3['woe'] = np.log((d3['bad'] / badnum) / (d3['good'] / goodnum))  # WOE per bin
        d3['badattr'] = d3['bad'] / badnum      # share of bad clients per bin
        d3['goodattr'] = d3['good'] / goodnum   # share of good clients per bin
        iv = ((d3['badattr'] - d3['goodattr']) * d3['woe']).sum()  # Information Value
        d4 = d3.sort_values(by='good').reset_index(drop=True)  # rank bins
        woe = list(d4['woe'].round(3))
        return iv, d3, woe

    def self_bin_numeric(X, cut):
        # Same as above, but a numeric attribute is first cut into intervals.
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, cut)})
        d2 = d1.groupby('Bucket', as_index=True)
        d3 = pd.DataFrame(d2.count(), columns=['good'])
        d3['good'] = d2.sum().Y
        d3['total'] = d2.count().Y
        d3['bad'] = d3['total'] - d3['good']
        d3['rate'] = d2.mean().Y
        d3['woe'] = np.log((d3['bad'] / badnum) / (d3['good'] / goodnum))
        d3['badattr'] = d3['bad'] / badnum
        d3['goodattr'] = d3['good'] / goodnum
        iv = ((d3['badattr'] - d3['goodattr']) * d3['woe']).sum()
        d4 = d3.sort_index(axis=1).reset_index(drop=True)
        woe = list(d4['woe'].round(3))
        return iv, d3, woe

    iv_fw = self_bin_object(X['foreign_worker'])[0]
    iv_acs = self_bin_object(X['account_check_status'])[0]
    iv_ch = self_bin_object(X['credit_history'])[0]
    iv_pur = self_bin_object(X['purpose'])[0]
    iv_sav = self_bin_object(X['savings'])[0]
    iv_pes = self_bin_object(X['present_emp_since'])[0]
    iv_pss = self_bin_object(X['personal_status_sex'])[0]
    iv_od = self_bin_object(X['other_debtors'])[0]
    iv_pro = self_bin_object(X['property'])[0]
    iv_oip = self_bin_object(X['other_installment_plans'])[0]
    iv_hous = self_bin_object(X['housing'])[0]
    iv_job = self_bin_object(X['job'])[0]
    iv_tele = self_bin_object(X['telephone'])[0]
    iv_iaip = self_bin_object(X['installment_as_income_perc'])[0]
    iv_prs = self_bin_object(X['present_res_since'])[0]
    iv_ctb = self_bin_object(X['credits_this_bank'])[0]
    iv_pum = self_bin_object(X['people_under_maintenance'])[0]
    iv_dim = self_bin_numeric(X['duration_in_month'], 4)[0]
    iv_ca = self_bin_numeric(X['credit_amount'], 5)[0]
    iv_age = self_bin_numeric(X['age'], 5)[0]

    # IV values listed in the same order as X.columns.
    IV = [iv_acs, iv_dim, iv_ch, iv_pur, iv_ca, iv_sav, iv_pes, iv_iaip,
          iv_pss, iv_od, iv_prs, iv_pro, iv_age, iv_oip, iv_hous, iv_ctb,
          iv_job, iv_pum, iv_tele, iv_fw]
    indices = np.argsort(IV)[::-1]
    featurerank = [X.columns[i] for i in indices]

    plt.figure(figsize=(10, 8))
    feature_imp = pd.Series(IV, index=X.columns).sort_values(ascending=False)
    ivlist = pd.Series(IV).sort_values(ascending=False)
    for a, b in zip(ivlist.values, np.arange(0.2, 20.2, 1)):
        plt.text(a, b, round(a, 4))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlim((0, 1))
    plt.xlabel('Information Value')
    plt.ylabel('Attribute')
    plt.title("Visualizing Important Features")
    plt.show()
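# Sanity check of the WOE / IV arithmetic used in self_bin_object above, on
# an invented two-bucket attribute (toy numbers, not repository data):
import numpy as np
import pandas as pd

d1 = pd.DataFrame({'X': ['A', 'A', 'A', 'B', 'B', 'B'],
                   'Y': [1, 1, 0, 1, 0, 0]})   # Y: 1 = good, 0 = bad
badnum = int((d1['Y'] == 0).sum())    # 3 bad clients overall
goodnum = int((d1['Y'] == 1).sum())   # 3 good clients overall
g = d1.groupby('X')['Y']
good = g.sum()                        # good clients per bucket
bad = g.count() - good                # bad clients per bucket
woe = np.log((bad / badnum) / (good / goodnum))
iv = ((bad / badnum - good / goodnum) * woe).sum()
# Bucket A: woe = ln((1/3)/(2/3)) = -0.693; bucket B: woe = +0.693.
# iv = (1/3 - 2/3)(-0.693) + (2/3 - 1/3)(0.693) ≈ 0.462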
            estimator._mvalue(x_sample, y_sample)
            estimator.fit(x_sample, y_sample)
            y_pred.append(estimator.predict(x_test))
        # Majority vote over the +/-1 predictions of all estimators.
        result = sum(np.array(y_pred))
        result[result >= 1] = 1
        result[result <= -1] = -1
        return result


if __name__ == '__main__':
    data = DataDeal.get_data()
    Train_data, test = train_test_split(data, test_size=0.2)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    kernel_dict = {'type': 'RBF', 'sigma': 0.717}
    fuzzyvalue = {'type': 'Cen', 'function': 'Exp'}

    clf = FSVM.FSVM(3, kernel_dict, fuzzyvalue, 3 / 4)
    bag = Bagging(20, clf, 0.7, 'fsvm', 'UpSampling')
    y_pred = bag.MutModel_clf(x_train, y_train, x_test)
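# The clipping above implements a majority vote over +/-1 predictions.
# A toy illustration with invented votes (three estimators, four points):
import numpy as np

votes = [np.array([ 1,  1, -1,  1]),
         np.array([ 1, -1, -1,  1]),
         np.array([-1, -1, -1,  1])]
result = sum(np.array(votes))   # vote totals: [ 1 -1 -3  3]
result[result >= 1] = 1
result[result <= -1] = -1
print(result)                   # [ 1 -1 -1  1]
# Note: with an even number of estimators a tied vote sums to 0, which
# neither clipping step resolves.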
# y_predict = clf.predict(X_test)
# y_test = np.array(y_test)
# for i in range(len(y_test)):
#     if y_test[i] == 0:
#         y_test[i] = -1
# print(np.mean(y_predict != y_test))
# precision(y_predict, y_test)
#
# if __name__ == '__main__':
#     fsvmTrain('lowSampling')


# Test Code for _LSSVMtrain
if __name__ == "__main__":
    data = DataDeal.get_data("../german_numerical.csv")
    precisionArray = []
    X = data[:, :-1]
    y = data[:, -1]

    # data = pd.read_csv("../processedData.csv", sep=",", header=0)
    # # X = applyPcaWithStandardisation(data[data.columns[1:]], 0.9)
    # X = applyPcaWithNormalisation(data[data.columns[1:]], 0.9)
    # # X = np.array(data[data.columns[1:]])
    # y = np.array(data["default"].map({0: -1, 1: 1}))
    # parameter = grid_search(X, y, kernel='gaussian')

    sss = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=12)
    # sss = StratifiedKFold(n_splits=10, random_state=12, shuffle=True)
    for train, test in sss.split(X, y):
        X_test = X[test]
        y_test = y[test]
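# The fragment above is truncated inside the loop. A self-contained sketch of
# how such a StratifiedShuffleSplit evaluation is typically completed, using
# a plain sklearn SVC as a stand-in for the repository's classifiers
# (hypothetical helper, not the file's actual code):
import numpy as np
from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit

def mean_accuracy(X, y, n_splits=20, test_size=0.2, random_state=12):
    scores = []
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=random_state)
    for train, test in sss.split(X, y):
        clf = svm.SVC()                 # stand-in estimator
        clf.fit(X[train], y[train])
        scores.append(np.mean(clf.predict(X[test]) == y[test]))
    return np.mean(scores)              # accuracy averaged over the splits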