def lrisTrain():
    """Train a decision tree on ``../data/data.csv`` and export it as a PDF.

    The function loads the CSV through ``dataUtil.load_data``, drops the
    first column, keeps a fixed list of columns, splits the frame into
    features/target with ``dataUtil.get_x_y`` and into train/test parts
    with ``dataUtil.k_fold``, fits a decision tree, prints the accuracy on
    the test part and writes the rendered tree to ``treetwo.pdf``.

    Every intermediate data set is printed to stdout for inspection.
    Nothing is returned.
    """
    data0 = dataUtil.load_data('../data/data.csv')
    data = data0.iloc[:, 1:]
    # Named `columns` (not `str`) so the builtin `str` is not shadowed.
    columns = [
        'number', 'cql', 'vitalCapacity', 'a', 'zxqdcs', 'age', 'BMI', 'label'
    ]
    data = data[columns]
    # Standardization is currently disabled:
    # data = dataUtil.standardization(data)
    print('*' * 10, 'data(标准化之后)')
    print(data.head(10))

    # Split the frame into features (x) and target (y).
    x, y = dataUtil.get_x_y(data)
    for title, value in (('x', x), ('y', y)):
        print('*' * 10, title)
        print(value)

    # Split into training and test sets.
    train_data, test_data, train_target, test_target = dataUtil.k_fold(x, y)
    for title, value in (
        ('train_data', train_data),
        ('test_data', test_data),
        ('train_target', train_target),
        ('test_target', test_target),
    ):
        print('*' * 10, title)
        print(value)

    # The model is evaluated with accuracy_score, which is a classification
    # metric and raises ValueError when the predictions are continuous. A
    # regressor can produce such predictions, so a classifier is fitted.
    clf = tree.DecisionTreeClassifier()
    clf.fit(train_data, train_target)

    # Predict on the held-out part and report the accuracy.
    y_pred = clf.predict(test_data)
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))

    # Render the fitted tree with Graphviz and save it as a PDF file.
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("treetwo.pdf")
def draw_RF_DT():
    """Plot the RF and DT metric curves and save them to ``../pic/RF_DT.png``.

    Reads ``../data/RF.csv`` and ``../data/DT.csv`` (first column dropped),
    draws six lines over the x positions 1..6 -- solid for RF, dashed for
    DT; red, green and blue for the ACC, Precision and Recall series --
    then saves the figure and shows it.
    """
    # SimHei is selected so that the Chinese axis labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']

    rf_frame = dataUtil.load_data('../data/RF.csv').iloc[:, 1:]
    dt_frame = dataUtil.load_data('../data/DT.csv').iloc[:, 1:]

    # All series are extracted before any figure is created.
    rf_acc = rf_frame.iloc[:, 0].values
    rf_precision = rf_frame.iloc[:, 1].values
    rf_recall = rf_frame.iloc[:, 2].values
    dt_acc = dt_frame['ACC'].values
    dt_precision = dt_frame.iloc[:, 1].values
    dt_recall = dt_frame.iloc[:, 2].values

    subset_ids = [1, 2, 3, 4, 5, 6]
    line_width = 1

    # (values, color, linestyle, legend label), listed in drawing order.
    curves = (
        (rf_acc, 'r', '-', 'RF_ACC'),
        (dt_acc, 'r', '--', 'DT_ACC'),
        (rf_precision, 'g', '-', 'RF_Precision'),
        (dt_precision, 'g', '--', 'DT_Precision'),
        (rf_recall, 'b', '-', 'RF_Recall'),
        (dt_recall, 'b', '--', 'DT_Recall'),
    )

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    for values, colour, style, name in curves:
        axes.plot(subset_ids, values, color=colour, lw=line_width,
                  linestyle=style, label=name)

    axes.set_xlabel('子集编号')
    axes.set_ylabel('指标')
    plt.legend(loc="lower right")
    plt.savefig('../pic/RF_DT.png', dpi=400, bbox_inches='tight')
    plt.show()
# encoding=utf-8 """ @Time : 2020/6/20 15:35 @Author : LiuYanZhe @File : GA.py @Software: PyCharm @Description: 使用遗传算法优化神经网络 """ import matplotlib.pyplot as plt from sko.GA import GA from math_model.py import ANN import numpy as np import pandas as pd from math_model.util import dataUtil data = dataUtil.load_data('../data/data_22.csv') print('data shape:', data.shape) # data = data.iloc[:, 2:] # 标准化 # data = dataUtil.standardization(data) input_num = 14 # 划分x,y x, y = dataUtil.get_x_y(data) print('x:', x.shape) print('y:', y.shape) # 正则化 # x = dataUtil.normalization(x) # 标准化 x = dataUtil.standardization2(x) # 归一化 # x = dataUtil.scale(x)
def get_probability(index, data):
    """Return the fraction of label-1 rows in ``data[index]``.

    Only rows whose ``'label'`` is 0 or 1 are counted; the result is
    ``count(label == 1) / (count(label == 1) + count(label == 0))``.

    Raises:
        ZeroDivisionError: if no row has label 0 or 1.
    """
    group = data[index]
    labels = group['label']
    negative_count = len(group[labels == 0])
    positive_count = len(group[labels == 1])
    return positive_count / (positive_count + negative_count)


# Return the negative (label 0) points of a class
def get_yin(index, data):
    """Return the ``'number'`` column of the label-0 rows of ``data[index]``."""
    group = data[index]
    return group[group['label'] == 0]['number']


data0 = dataUtil.load_data('../data/data.csv')
# Standardization (disabled)
# data0 = dataUtil.standardization(data0)
# Keep every column except the first and the last one.
data = data0.iloc[:, 1:data0.shape[1] - 1]
# data0 = data0.iloc[:, 1:]
# data0 = data0.drop(['wzqdcsd', 'wzqdcsC', 'wzqdcsA', 'wzqdcsB'], axis=1)
# data0 = data0.drop(['wzqdcsC', 'wzqdcsA', 'wzqdcsB'], axis=1)
# Scaling (disabled)
# data = dataUtil.scale(data)
best_j = []  # number of classes
best_sc = []  # ratio
best_i = []  # index of the class
num_list = set()  # set that stores the "number" values without duplicates
print(data)
for j in range(2, 100):
    print('*' * 20, j)
# encoding=utf-8 """ @Time : 2020/6/21 21:34 @Author : LiuYanZhe @File : set_nan.py @Software: PyCharm @Description: 填充缺失值 """ from sklearn.linear_model import Ridge, Lasso from math_model.util import dataUtil import matplotlib.pyplot as plt import numpy as np import pandas as pd # 读取数据 data0 = dataUtil.load_data('../data/data_nan.csv') data_nan, data_no_nan, y_train = dataUtil.get_nan_nonan(data0) # 划分数据 x_train = data_no_nan.iloc[:, 6:data_no_nan.shape[1] - 1] x_pre = data_nan.iloc[:, 6:data_no_nan.shape[1] - 1] print(x_train) print(x_pre) print(y_train) # clf = Ridge(alpha=.5) clf = Ridge() clf.fit(x_train, y_train) print(clf.coef_) # 相关系数 print(clf.intercept_) # 截距 y_pre = clf.predict(x_pre) print('*' * 10, 'pre') print(y_pre)