import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import joblib
from utils.dataprocess import preProc, preProcTest, toJson
from sklearn.model_selection import train_test_split, GridSearchCV

# Uses the native xgboost interface.

dataTrain = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_train.csv')
xTrain, yTrain, scaler = preProc(dataTrain, toNumpy=True)
# weight = number of negative samples / number of positive samples.
# Assumes yTrain holds 0/1 labels so that its sum is the positive count
# -- TODO confirm against preProc.
# The positive count is taken once with a vectorised reduction instead of
# walking the array twice with the builtin sum().
positive_count = np.sum(yTrain)
weight = (yTrain.shape[0] - positive_count) * 1.0 / positive_count
# Hold out 20% of the samples for validation. No random_state is passed, so
# the split differs from run to run. The weight above is computed on the full
# data set, before the split.
xTrain, xValidation, yTrain, yValidation = train_test_split(xTrain, yTrain, test_size=0.2)

# Wrap both partitions in xgboost's DMatrix container.
dtrain = xgb.DMatrix(xTrain, label=yTrain)
dvalidation = xgb.DMatrix(xValidation, label=yValidation)
params = {
    # xgboost宏观特征参数
    'booster': 'gbtree',
    'nthread': 5,  # 线程数
    'silent': 0,  # 为1时,静默开启

    # booster参数
    'eta': 0.1,  # learning rate 通过减少每一步的权重,提高鲁棒性
    'gamma': 0.1,  # 节点分裂所需要的最小损失函数下降值
    'max_depth': 9,  # 最大树高,限制过拟合
    'lambda': 2,  # 权重的L2正则项
    'alpha': 1,  # 权重的L1正则项
        '''
        for i in range(self.layer_num):
            x = self.relu[i](self.bns[i](self.hiddens[i](x)))
        x = self.predict(x)
        return x


# Load the training data.
dataTrain = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_train.csv')
# Mean of the 'Response' column. Assuming 0/1 labels this is the fraction of
# positive samples -- TODO confirm. Despite its name, it is used below as the
# weight of the *negative* class.
weight_negative = dataTrain['Response'].mean()
# The classes are imbalanced, so per-class weights are used to compensate:
# the first entry is the weight of the negative class, the second the weight
# of the positive class.
weights = [weight_negative,
           1 - weight_negative]  # [0.12293666666666667, 0.8770633333333333]
# PyTorch expects the class weights as a float tensor.
weights = torch.tensor(weights, dtype=torch.float32)
xTrain, yTrain, scaler = preProc(dataTrain, toTensor=True)
# Persist the full preprocessed tensors (before the train/validation split).
torch.save(xTrain, 'xTrain.pt')
torch.save(yTrain, 'yTrain.pt')
# Hold out 20% for validation; the fixed seed makes the split reproducible.
xTrain, xValidation, yTrain, yValidation = train_test_split(xTrain,
                                                            yTrain,
                                                            test_size=0.2,
                                                            random_state=1)

# Mini-batch loader over the training partition, reshuffled every epoch.
torch_dataset = Data.TensorDataset(xTrain, yTrain)
loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
# NOTE(review): the meaning of the four Net arguments is not visible here;
# presumably (input features, hidden width, output classes, hidden layers)
# -- confirm against Net.__init__.
net = Net(14, 15, 2, 3)
print(net)
Пример #3
0
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import joblib
import time
from utils.dataprocess import preProc, preProcTest, toJson
import sklearn.metrics as metrics

t1 = time.time()
# Load and preprocess the training data.
dataTrain = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_train.csv')
xTrain, yTrain, scaler = preProc(dataTrain)

# Build the model.
max_iter = 100000000
# class_weight='balanced' on the model weights each class inversely to its
# frequency, i.e. positives are scaled in proportion to the number of
# negatives and vice versa.
# fit() also accepts sample_weight, an array with one weight per sample.
# Both serve the same purpose, so only one of them is used.
model = SVC(C=1.0, kernel='rbf', gamma='auto', tol=0.2, cache_size=1024,
            class_weight='balanced', max_iter=max_iter)
model.fit(xTrain, yTrain, sample_weight=None)

# Predict once and derive every metric from that single pass. model.score()
# would run a second full prediction over xTrain just to compute the same
# mean accuracy.
# NOTE: all metrics below are measured on the training data itself.
pred_y = model.predict(xTrain)
score = metrics.accuracy_score(yTrain, pred_y)
print('Score:', score)
fscore = metrics.f1_score(yTrain, pred_y)
print('Fvalue:', fscore)
# Number of samples predicted as class 1, counted with a vectorised reduction.
equal1count = np.count_nonzero(pred_y == 1)
print('预测结果为1的数量:', equal1count)
print(pred_y)
joblib.dump(model, 'carInsurancePredSVM.model')
# The value printed here is the configured iteration cap (max_iter), not the
# number of iterations the solver actually ran.
print('迭代次数', max_iter, '耗时:', time.time() - t1)