#print "start" train_data = pd.read_csv(datapath, header=None, index_col=None) X = np.array(train_data) Y = list(map(lambda x: 1, xrange(len(train_data) // 2))) Y2 = list(map(lambda x: 0, xrange(len(train_data) // 2))) Y.extend(Y2) Y = np.array(Y) svc = svm.SVC(probability=True) parameters = {'kernel': ['rbf'], 'C': [math.pow(2,e) for e in range(-5,15,2)], 'gamma': [math.pow(2,e) for e in range(-15, -5, 2)]} #parameters = {'kernel': ['rbf'], 'C':map(lambda x:2**x,np.linspace(-2,5,7)), 'gamma':map(lambda x:2**x,np.linspace(-5,2,7))} clf = GridSearchCV(svc, parameters, cv=crossvalidation_values, n_jobs=CPU_values, scoring='accuracy') clf.fit(X, Y) C=clf.best_params_['C'] gamma=clf.best_params_['gamma'] y_predict=cross_val_predict(svm.SVC(kernel='rbf',C=C,gamma=gamma),X,Y,cv=crossvalidation_values,n_jobs=CPU_values) y_predict_prob=cross_val_predict(svm.SVC(kernel='rbf',C=C,gamma=gamma,probability=True),X,Y,cv=crossvalidation_values,n_jobs=CPU_values,method='predict_proba') joblib.dump(clf,path+classifier+mode+outputname+".model") predict_save=[Y.astype(int),y_predict.astype(int),y_predict_prob[:,1]] predict_save=np.array(predict_save).T pd.DataFrame(predict_save).to_csv('Before_'+path+classifier+mode+outputname+"_"+'_predict_crossvalidation.csv',header=None,index=False) ROC_AUC_area=metrics.roc_auc_score(Y,y_predict_prob[:,1]) ACC=metrics.accuracy_score(Y,y_predict) precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict) F1_Score=metrics.f1_score(Y, y_predict) F_measure=F1_Score MCC=metrics.matthews_corrcoef(Y, y_predict) pos=TP+FN neg=FP+TN savedata=[[['SVM'+"C:"+str(C)+"gamma:"+str(gamma),ACC,precision, recall,SN, SP, GM,F_measure,F1_Score,MCC,ROC_AUC_area,TP,FN,FP,TN,pos,neg]]] easy_excel.save(classifier+"_crossvalidation",[str(X.shape[1])],savedata,path+'cross_validation_'+classifier+"_"+outputname+'.xls')
F_measure = F1_Score
MCC = metrics.matthews_corrcoef(Y, y_predict)
pos = TP + FN
neg = FP + TN
savedata = [[['xgboost' + "n_estimators:" + str(n_estimators) + "max_depth:" + str(max_depth) + "learning_rate:" + str(learning_rate),
              ACC, precision, recall, SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area,
              TP, FN, FP, TN, pos, neg]]]
if ACC > bestACC:
    bestACC = ACC
    bestn_estimators = n_estimators
    bestlearning_rate = learning_rate
    best_savedata = savedata
    bestmax_depth = max_depth
    best_dimension = X.shape[1]
print savedata
print X.shape[1]
with open(classifier + mode + "all_dimension_results.txt", 'a') as f:
    f.write(str(savedata) + "\n")
all_dimension_results.append(savedata)
print bestACC
print bestn_estimators
print bestlearning_rate
print bestmax_depth
print best_dimension
easy_excel.save("xgboost_jackknife", [str(best_dimension)], best_savedata,
                path + classifier + mode + 'jackknife_' + outputname + '.xls')
TN_all = TN_all + TN
FP_all = FP_all + FP
FN_all = FN_all + FN
F_measure_all = F_measure_all + F_measure
F1_Score_all = F1_Score_all + F1_Score
pos_all = pos_all + pos
neg_all = neg_all + neg
MCC_all = MCC_all + MCC
all_y = [np.array(Y_all).astype(int),
         np.array(y_pred_all).astype(int),
         np.array(y_pred_prob_all).astype(list)[:, 1]]
pd.DataFrame(np.matrix(all_y).T).to_csv(path + outputname + "_predict.csv",
                                        header=None, index=False)
fpr, tpr, thresholds = roc_curve(np.array(Y_all).T,
                                 list(np.array(y_pred_prob_all).astype(list)[:, 1]))
roc_auc = auc(fpr, tpr)
savedata = [[['svm' + "C:" + str(C) + "gamma:" + str(gamma),
              ACC_all / divided_num, precision_all / divided_num, recall_all / divided_num,
              SN_all / divided_num, SP_all / divided_num, GM_all / divided_num,
              F_measure_all / divided_num, F1_Score_all / divided_num, MCC_all / divided_num,
              roc_auc, TP_all, FN_all, FP_all, TN_all, pos_all, neg_all]]]
print savedata
easy_excel.save("svm_independent_test", [str(X_train.shape[1])], savedata,
                path + outputname + '.xls')
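# The *_all accumulators above are updated once per split of an enclosing
# independent-test loop that is not shown in this excerpt. A rough, commented
# sketch of the implied pattern (the loop and the per-split names are assumed):
#for split_index in range(divided_num):
#    ...  # fit on the training part of this split, predict the held-out part
#    ...  # compute ACC, precision, recall, SN, SP, GM, MCC, TP, TN, FP, FN
#    ACC_all = ACC_all + ACC
#    TP_all = TP_all + TP
#    Y_all.extend(list(Y_test))
#    y_pred_all.extend(list(y_pred))
#    y_pred_prob_all.extend(list(y_pred_prob))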
predict_save = np.array(predict_save).T
pd.DataFrame(predict_save).to_csv(path + classifier + mode + outputname + "_" + '_predict_crossvalidation.csv',
                                  header=None, index=False)
ROC_AUC_area = metrics.roc_auc_score(Y, y_predict)
ACC = metrics.accuracy_score(Y, y_predict)
precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
F1_Score = metrics.f1_score(Y, y_predict)
F_measure = F1_Score
MCC = metrics.matthews_corrcoef(Y, y_predict)
pos = TP + FN
neg = FP + TN
savedata = [[['xgboost' + "n_estimators:" + str(n_estimators) + "max_depth:" + str(max_depth) + "learning_rate:" + str(learning_rate),
              ACC, precision, recall, SN, SP, GM, F_measure, F1_Score, MCC, ROC_AUC_area,
              TP, FN, FP, TN, pos, neg]]]
if ACC > bestACC:
    bestACC = ACC
    bestn_estimators = n_estimators
    bestlearning_rate = learning_rate
    best_savedata = savedata
    bestmax_depth = max_depth
    best_dimension = X.shape[1]
print savedata
print X.shape[1]
with open(classifier + mode + "all_dimension_results.txt", 'a') as f:
    f.write(str(savedata) + "\n")
all_dimension_results.append(savedata)
print bestACC
print bestn_estimators
print bestlearning_rate
print bestmax_depth
print best_dimension
easy_excel.save("xgboost_crossvalidation", [str(best_dimension)], best_savedata,
                path + classifier + mode + 'cross_validation_' + name + '.xls')
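# The two XGBoost fragments above sit inside a hyper-parameter search whose
# loop headers are not shown. A minimal sketch of the implied loop; the grid
# values are illustrative assumptions, and X, Y, crossvalidation_values and
# CPU_values are expected to already exist as in the surrounding scripts:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict

bestACC = 0
for n_estimators in [100, 300, 500]:        # assumed grid
    for max_depth in [3, 5, 7]:             # assumed grid
        for learning_rate in [0.01, 0.1]:   # assumed grid
            model = XGBClassifier(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  learning_rate=learning_rate)
            y_predict = cross_val_predict(model, X, Y,
                                          cv=crossvalidation_values,
                                          n_jobs=CPU_values)
            # ...then compute ACC and the other metrics and update bestACC,
            # bestn_estimators, bestmax_depth, bestlearning_rate as above.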
def SVM_distance(inputname, outputname, distance, crossvalidation_values, CPU_values, SVM_distance_results):
    datapath = inputname
    classifier = "SVM"
    mode = "crossvalidation"
    print "start"
    train_data = pd.read_csv(datapath, header=None, index_col=None)
    print len(train_data)
    Y = list(map(lambda x: 1, xrange(len(train_data) // 2)))
    Y2 = list(map(lambda x: 0, xrange(len(train_data) // 2)))
    Y.extend(Y2)
    Y = np.array(Y)
    F, pval = f_classif(train_data, Y)
    idx = np.argsort(F)
    selected_list_ = idx[::-1]
    F_sort_value = [F[e] for e in selected_list_]
    with open(SVM_distance_results + outputname + "all_dimension_results.txt", 'w') as f:
        f.write(str(F_sort_value) + "\n")
    with open(SVM_distance_results + outputname + "all_dimension_results.txt", 'a') as f:
        f.write(str(selected_list_) + "\n")
    print "deal with data"
    selected_list_ = [a for a, b in zip(selected_list_, F_sort_value) if not math.isnan(b)]
    with open(SVM_distance_results + outputname + "all_dimension_results.txt", 'a') as f:
        f.write(str(selected_list_) + "\n")
    bestACC = 0
    best_c = 0
    best_g = 0
    best_dimension = 0
    all_dimension_results = []
    select_list = []
    best_savedata = ""
    select_num1 = 0
    for select_num in range(0, len(selected_list_), distance):
        if select_num > 0:
            for select_num1 in range(select_num - distance + 1, select_num + 1):
                temp_data = selected_list_[select_num1]
                select_list.append(int(temp_data))
            train_data2 = train_data.values
            X_train = pd.DataFrame(train_data2)
            X_train = X_train.iloc[:, select_list]
            X = np.array(X_train)
        else:
            temp_data = selected_list_[select_num]
            select_list.append(int(temp_data))
            train_data2 = train_data.values
            X_train = pd.DataFrame(train_data2)
            X_train = X_train.iloc[:, select_list]
            X = np.array(X_train)
        svc = svm.SVC(probability=True)
        parameters = {'kernel': ['rbf'],
                      'C': map(lambda x: 2**x, np.linspace(-2, 5, 7)),
                      'gamma': map(lambda x: 2**x, np.linspace(-5, 2, 7))}
        clf = GridSearchCV(svc, parameters, cv=crossvalidation_values, n_jobs=CPU_values, scoring='accuracy')
        clf.fit(X, Y)
        C = clf.best_params_['C']
        gamma = clf.best_params_['gamma']
        y_predict = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma),
                                      X, Y, cv=crossvalidation_values, n_jobs=CPU_values)
        y_predict_prob = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma, probability=True),
                                           X, Y, cv=crossvalidation_values, n_jobs=CPU_values,
                                           method='predict_proba')
        joblib.dump(clf, SVM_distance_results + outputname + "_" + classifier + mode + str(select_num + 1) + ".model")
        predict_save = [Y.astype(int), y_predict.astype(int), y_predict_prob[:, 1]]
        predict_save = np.array(predict_save).T
        #pd.DataFrame(predict_save).to_csv('Before_' + path + classifier + mode + outputname + "_" + '_predict_crossvalidation.csv', header=None, index=False)
        ROC_AUC_area = metrics.roc_auc_score(Y, y_predict_prob[:, 1])
        ACC = metrics.accuracy_score(Y, y_predict)
        precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
        F1_Score = metrics.f1_score(Y, y_predict)
        F_measure = F1_Score
        MCC = metrics.matthews_corrcoef(Y, y_predict)
        pos = TP + FN
        neg = FP + TN
        savedata = [[['SVM' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision, recall, SN, SP, GM,
                      F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN, pos, neg]]]
        if ACC > bestACC:
            bestACC = ACC
            best_c = C
            best_g = gamma
            best_savedata = savedata
            best_dimension = X.shape[1]
        print savedata
        print X.shape[1]
        with open(SVM_distance_results + outputname + "all_dimension_results.txt", 'a') as f:
            f.write(str(savedata) + "\n")
        all_dimension_results.append(savedata)
    print bestACC
    print best_c
    print best_g
    print best_dimension
    y_predict1 = cross_val_predict(svm.SVC(kernel='rbf', C=best_c, gamma=best_g),
                                   X, Y, cv=crossvalidation_values, n_jobs=CPU_values)
    y_predict_prob1 = cross_val_predict(svm.SVC(kernel='rbf', C=best_c, gamma=best_g, probability=True),
                                        X, Y, cv=crossvalidation_values, n_jobs=CPU_values,
                                        method='predict_proba')
    predict_save1 = [Y.astype(int), y_predict1.astype(int), y_predict_prob1[:, 1]]
    predict_save1 = np.array(predict_save1).T
    pd.DataFrame(predict_save1).to_csv(SVM_distance_results + outputname + "_" + classifier + mode + '_best_dim_pro_features.csv',
                                       header=None, index=False)
    easy_excel.save("SVM_crossvalidation", [str(best_dimension)], best_savedata,
                    SVM_distance_results + outputname + "_" + classifier + mode + '_best_ACC.xls')
    return y_predict_prob1[:, 1]
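# Hypothetical call to SVM_distance; the file name, step size, and output
# directory are illustrative assumptions, not values from this project:
#best_dim_prob = SVM_distance(inputname='features.csv',
#                             outputname='demo',
#                             distance=10,               # add 10 F-score-ranked features per step
#                             crossvalidation_values=5,  # 5-fold cross-validation
#                             CPU_values=-1,             # use all CPU cores
#                             SVM_distance_results='results/')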
X, Y, cv=10, n_jobs=-1).mean()
ACC = metrics.accuracy_score(Y, y_predict)
precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
F1_Score = metrics.f1_score(Y, y_predict)
F_measure = F1_Score
MCC = metrics.matthews_corrcoef(Y, y_predict)
pos = TP + FN
neg = FP + TN
whole_result.append([['svm' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision, recall, SN, SP, GM,
                      F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN, pos, neg]])
whole_dimension.append(str(X.shape[1]))
print whole_result
easy_excel.save("svm_crossvalidation", whole_dimension, whole_result, 'svm.xls')

# print RFH_PseDNC
# RFH_ = pd.read_csv(sys.argv[1], header=None, index_col=None)
# RFH_ = pd.DataFrame(RFH_).astype(float)
# PseDNC_ = pd.read_csv(sys.argv[2], header=None, index_col=None)
# print len(PseDNC_.values[0])
# RFH_PseDNC = pd.concat([RFH_, PseDNC_], axis=1)
# pd.DataFrame(RFH_PseDNC).to_csv(sys.argv[3], header=None, index=False)
# print RFH_PseDNC
        print u'>>>', name, 'is training...searching best parms...'
        if isMultipleThread:
            new_thread = ClassifyThread(name, grid_search, X_train, y_train,
                                        test_x=X_test, test_y=y_test)
            new_thread.start()
            threads.append(new_thread)
        else:
            loop_classifier(name, grid_search, X_train, y_train,
                            test_x=X_test, test_y=y_test)
    else:
        experiment = 'cross-validation results'
        print u'>>>', name, 'is cross validating...searching best parms...'
        if isMultipleThread:
            new_thread = ClassifyThread(name, grid_search, X, y, cv=cv)
            new_thread.start()
            threads.append(new_thread)
        else:
            loop_classifier(name, grid_search, X, y, cv=cv)
    print 'Time cost: ', clock() - sec

# Wait for all threads to finish
for t in threads:
    t.join()
dimensions.append(str(dimension))
big_results.append(results)
print 'Time cost: ', clock() - sec

# Save the results to Excel
print '====================='
if easy_excel.save(experiment, dimensions, big_results, excel_name):
    print 'Save excel result file successfully.'
else:
    print 'Failed. Please close excel result file first.'
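# ClassifyThread is referenced above but not defined in this excerpt. A minimal
# sketch of a wrapper that would satisfy those calls (assumed, not the original
# class): it simply runs loop_classifier on a worker thread.
import threading

class ClassifyThread(threading.Thread):
    def __init__(self, clf_name, model, X, y, cv=0, test_x=None, test_y=None):
        threading.Thread.__init__(self)
        self.clf_name = clf_name
        self.model = model
        self.X, self.y = X, y
        self.cv = cv
        self.test_x, self.test_y = test_x, test_y

    def run(self):
        # Delegate to the same routine used by the single-threaded branch.
        loop_classifier(self.clf_name, self.model, self.X, self.y,
                        cv=self.cv, test_x=self.test_x, test_y=self.test_y)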
X_predict = clf.predict(X)
X_predict_proba = clf.predict_proba(X)
print X_predict_proba
#print X_predict
#print len(X_predict)
pd.DataFrame(X_predict_proba[:, 1]).to_csv(outputname1 + 'predict_proba.csv',
                                           header=None, index=False)
ROC_AUC_area = metrics.roc_auc_score(Y, X_predict_proba[:, 1])
ACC = metrics.accuracy_score(Y, X_predict)
precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, X_predict)
F1_Score = metrics.f1_score(Y, X_predict)
F_measure = F1_Score
MCC = metrics.matthews_corrcoef(Y, X_predict)
pos = TP + FN
neg = FP + TN
C = clf.best_params_['C']
gamma = clf.best_params_['gamma']
print X.shape[1]
print name1
print ACC
savedata = [[['SVM' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision, recall, SN, SP, GM,
              F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN, pos, neg]]]
easy_excel.save("SVM_crossvalidation", [str(X.shape[1])], savedata,
                'SVM_crossvalidation' + name1 + '_Predict_' + name + '.xls')
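# The block above assumes `clf` is an already-fitted GridSearchCV, e.g. one
# reloaded from a model saved during cross-validation; a hypothetical reload
# mirroring the joblib.dump call in the first script (the path is an assumption):
#clf = joblib.load(path + classifier + mode + outputname + ".model")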
        else:
            loop_classifier(name, grid_search, X_train, y_train,
                            test_x=X_test, test_y=y_test)
    else:
        experiment = 'cross-validation results'
        print u'>>>', name, 'is cross validating...searching best parms...'
        if isMultipleThread:
            new_thread = ClassifyThread(name, grid_search, X, y, cv=cv)
            new_thread.start()
            threads.append(new_thread)
        else:
            loop_classifier(name, grid_search, X, y, cv=cv)
    print 'Time cost: ', clock() - sec

# Wait for all threads to finish
for t in threads:
    t.join()
dimensions.append(str(dimension))
big_results.append(results)
print 'Time cost: ', clock() - sec

# Save the results to Excel
print '====================='
if easy_excel.save(experiment, dimensions, big_results, excel_name):
    print 'Save excel result file successfully.'
else:
    print 'Failed. Please close excel result file first.'
def SVM_calssfier(input_feature, proba_dir, model_dir, result_dir,
                  crossvalidation_values, CPU_values, output_name, t):
    #print(input_feature)
    #output_name = input_feature.split("\\")[-1].split(".")[0]
    #print(output_name)
    classifier = 'SVM'
    X = input_feature
    Y = list(map(lambda x: 1, xrange(len(input_feature) // 2)))
    Y2 = list(map(lambda x: 0, xrange(len(input_feature) // 2)))
    Y.extend(Y2)
    Y = np.array(Y)
    d = input_feature.shape[1]
    svc = svm.SVC(probability=True)
    parameters = {'kernel': ['rbf'],
                  'C': map(lambda x: 2**x, np.linspace(-2, 5, 7)),
                  'gamma': map(lambda x: 2**x, np.linspace(-5, 2, 7))}
    clf = GridSearchCV(svc, parameters, cv=crossvalidation_values, n_jobs=CPU_values, scoring='accuracy')
    clf.fit(X, Y)
    C = clf.best_params_['C']
    gamma = clf.best_params_['gamma']
    y_predict = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma),
                                  X, Y, cv=crossvalidation_values, n_jobs=CPU_values)
    y_predict_prob = cross_val_predict(svm.SVC(kernel='rbf', C=C, gamma=gamma, probability=True),
                                       X, Y, cv=crossvalidation_values, n_jobs=CPU_values,
                                       method='predict_proba')
    joblib.dump(clf, model_dir + "\\" + str(t) + "time.model")
    predict_save = [y_predict_prob[:, 0], y_predict_prob[:, 1]]
    predict_save = np.array(predict_save).T
    pd.DataFrame(predict_save).to_csv(proba_dir + str(t) + 'time.csv', header=None, index=False)
    ROC_AUC_area = metrics.roc_auc_score(Y, y_predict_prob[:, 1])
    ACC = metrics.accuracy_score(Y, y_predict)
    precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(Y, y_predict)
    F1_Score = metrics.f1_score(Y, y_predict)
    F_measure = F1_Score
    MCC = metrics.matthews_corrcoef(Y, y_predict)
    pos = TP + FN
    neg = FP + TN
    savedata = [[['SVM' + "C:" + str(C) + "gamma:" + str(gamma), ACC, precision, recall, SN, SP, GM,
                  F_measure, F1_Score, MCC, ROC_AUC_area, TP, FN, FP, TN, pos, neg]]]
    #print(savedata)
    #print(result_dir + output_name + 'cross_validation_' + str(d) + 'D_' + str(t) + 'time.xls')
    easy_excel.save(classifier + "_crossvalidation", [str(X.shape[1])], savedata,
                    result_dir + output_name + 'cross_validation_' + str(d) + 'D_' + str(t) + 'time.xls')
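# Hypothetical call to SVM_calssfier; the feature file and directories are
# illustrative assumptions, not paths from this project:
#features = pd.read_csv('features.csv', header=None, index_col=None)
#SVM_calssfier(input_feature=features,
#              proba_dir='proba/', model_dir='models', result_dir='results/',
#              crossvalidation_values=5,  # 5-fold cross-validation
#              CPU_values=-1,             # use all CPU cores
#              output_name='demo', t=1)   # t indexes the repetition / run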
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=-1),
    AdaBoostClassifier(),
    LinearSVC(),
    GaussianNB()]

# Load the raw data
second = clock()
X, y = get_data(input_file)
results = []
print 'Time cost on loading data: ', clock() - second

# Split the data or cross-validate, and collect the results
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=split_rate, random_state=0)
for name, model in zip(names, classifiers):
    if cv == 0:
        print u'>>>', name, 'is training...'
        out = loop_classifier(name, model, X_train, y_train, test_x=X_test, test_y=y_test)
    else:
        print u'>>>', name, 'is cross validating...'
        out = loop_classifier(name, model, X, y, cv=cv)
    if out is not None:
        results.append(out)

# Save the results to Excel
print '====================='
if easy_excel.save(str(X_train.shape[1]), results):
    print 'Save "results.xls" successfully.'
else:
    print 'Fail to save "results.xls". Please close "results.xls" first.'
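# These scripts target an older Python 2 / scikit-learn stack (print statements,
# xrange, time.clock(), sklearn.cross_validation). Under a current scikit-learn
# the split above would be written as follows (equivalent call, not part of the
# original code):
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_rate, random_state=0)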