import copy

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.utils import shuffle

# splitData, buildTrainModel and SMAPE are project helpers; sketches of what
# they are assumed to look like follow the functions that use them.


def kFoldCV(trainX, trainY, modelIndex, k=10, logFlag=True):

    datas, labels = splitData(trainX, trainY, k)

    res = []
    for i in range(k):
        copydata = copy.deepcopy(datas)  # back up the fold list so deletions do not mutate it

        # assemble the training and test samples for this fold
        testArr = copydata[i]
        del copydata[i]
        trainArr = np.vstack(tuple(copydata))

        # assemble the test and training labels
        copylabel = copy.deepcopy(labels)
        testLabel = copylabel[i]
        del copylabel[i]
        trainLabel = np.hstack(tuple(copylabel))

        # fit on the remaining folds and evaluate on the held-out fold
        rf = buildTrainModel(modelIndex=modelIndex)
        rf.fit(trainArr, trainLabel)
        pred = rf.predict(testArr)
        #pred = np.rint(pred)

        if logFlag:  # labels were log1p-transformed; invert before scoring
            pred = np.expm1(pred)
            testLabel = np.expm1(testLabel)

        res.append(SMAPE(testLabel, pred))
    print("score mean:", np.mean(res))
    print("score std:", np.std(res))
    return res
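

# splitData is not shown in this snippet. A minimal sketch of what kFoldCV
# assumes it does (split X and y into k roughly equal blocks, returned as two
# lists), offered as an assumption rather than the project's actual helper:
def splitData(trainX, trainY, k):
    X = np.asarray(trainX)
    Y = np.asarray(trainY)
    datas = np.array_split(X, k)    # k feature blocks, split along axis 0
    labels = np.array_split(Y, k)   # k matching label blocks
    return datas, labels
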
def crossValidation(trainX, trainY, modelIndex, cvRate=0.96, cvEpoch=20):

    scores = []
    for i in range(cvEpoch):
        X, Y = shuffle(trainX, trainY)  # shuffle the data
        offset = int(X.shape[0] * cvRate)
        X_train, y_train = X[:offset], Y[:offset]
        X_test, y_test = X[offset:], Y[offset:]
        #y_train = np.log(y_train+1)

        rf = buildTrainModel(modelIndex=modelIndex)

        rf.fit(X_train, y_train)
        pred = rf.predict(X_test)

        # round predictions to integers
        #pred = np.exp(pred)
        pred = np.rint(pred)
        acc = SMAPE(y_test, pred)
        scores.append(acc)

    # drop nan/inf scores (SMAPE can produce them on zero denominators)
    scores = [x for x in scores if np.isfinite(x)]
    print("score mean:", np.mean(scores))
    print("score std:", np.std(scores))
    skscores = cross_val_score(rf,
                               trainX,
                               trainY,
                               cv=10,
                               scoring="neg_mean_absolute_error")
    print("sklearn cv mean:", skscores.mean())
    print("sklearn cv std:", skscores.std())
    return scores, skscores
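

# SMAPE is also an external helper. A common formulation, given here only as
# an assumption about what the project implements:
def SMAPE(yTrue, yPred):
    yTrue = np.asarray(yTrue, dtype=float)
    yPred = np.asarray(yPred, dtype=float)
    # symmetric mean absolute percentage error; zero denominators yield
    # nan/inf, which crossValidation filters out of its score list
    denom = (np.abs(yTrue) + np.abs(yPred)) / 2.0
    return np.mean(np.abs(yPred - yTrue) / denom)
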
def train(features, trainPath, index, saveName):

    dataDF = pd.read_csv(trainPath, dtype={'link_ID': str})
    print("original dataset columns:", dataDF.columns)
    print("original data num is:", len(dataDF))

    #dataDF = dataDF.dropna(axis=0)  # drop rows with missing values
    #print("after drop na data num is:", len(dataDF))

    trainX = dataDF[features]
    trainY = dataDF['travel_time']

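    # log1p-transform the label; kFoldCV(logFlag=True) inverts it with expm1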
    trainY = np.log1p(trainY)

    print("trainx shape", trainX.values.shape)
    print("trainY shape", trainY.values.shape)

    rf = buildTrainModel(modelIndex=index)
    #rf = gridSearch(trainX, trainY, modelIndex=index)
    rf.fit(trainX, trainY)

    #scores, skscores = crossValidation(trainX, trainY, index)
    scores = kFoldCV(trainX, trainY, index, k=5)
    print("cross validation scores:", scores)
    #print("sklearn cross validation scores:", skscores)

    if index in (1, 2):
        print("feature score ", pd.DataFrame(rf.feature_importances_))
    elif index == 3:
        #print("feature score ", pd.DataFrame(rf.feature_importances_))
        xgb.plot_importance(rf)
        plt.savefig("../model/importance3.jpg")
    saveDir = "../model/"
    joblib.dump(rf, saveDir + saveName)
    return rf
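

# buildTrainModel is not shown either. Judging from how modelIndex is used
# above (feature_importances_ for 1 and 2, xgb.plot_importance for 3), a
# plausible sketch, offered as an assumption:
def buildTrainModel(modelIndex):
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    if modelIndex == 1:
        return RandomForestRegressor(n_estimators=200, n_jobs=-1)
    if modelIndex == 2:
        return GradientBoostingRegressor(n_estimators=200, max_depth=8)
    if modelIndex == 3:
        return xgb.XGBRegressor(n_estimators=200, max_depth=8, learning_rate=0.1)
    raise ValueError("unknown modelIndex: %r" % modelIndex)
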
def gridSearch(trainx, trainy, modelIndex):

    parameters = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [150, 200, 250],
        'subsample': [0.5, 0.7, 0.9, 1.0],
        'max_depth': [6, 8, 10],
        'max_features': ['sqrt', None]
    }

    rf = buildTrainModel(modelIndex=modelIndex)
    grid_search = GridSearchCV(rf, parameters, verbose=2, cv=10)

    grid_search.fit(trainx, trainy)

    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    # return the refit best estimator so callers can do rf = gridSearch(...)
    return grid_search.best_estimator_

Example #5

def train(features, trainPath, testPath, index, saveName, onehotFeature):
    trainDF = pd.read_csv(trainPath, dtype={'link_ID': str})
    print("original dataset columns:", trainDF.columns)
    testDF = pd.read_csv(testPath, dtype={'link_ID': str})

    print("train data num:", len(trainDF))

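    # concatenate train and test so get_dummies emits the same one-hot
    # columns for both sets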
    totalDF = pd.concat([trainDF, testDF], axis=0)
    print("train + test data num:", len(totalDF))

    trainX = totalDF[features]
    for categoryFeature in onehotFeature:
        onehot = pd.get_dummies(trainX[categoryFeature], prefix=categoryFeature, sparse=True)
        trainX = pd.concat([trainX, onehot], axis=1)
    # keep only the training rows again; this assumes the training set is
    # exactly the months before June
    trainX = trainX[trainX["month"] < 6]
    trainX = trainX.drop(onehotFeature, axis=1)
    print(trainX.columns)

    # log1p-transform the label (inverted with expm1 at evaluation time)
    trainY = trainDF['travel_time']
    trainY = np.log1p(trainY)

    print("trainx shape", trainX.values.shape)
    print("trainY shape", trainY.values.shape)

    rf = buildTrainModel(modelIndex=index)
    # rf = gridSearch(trainX, trainY, modelIndex=index)
    rf.fit(trainX, trainY)

    # scores, skscores = crossValidation(trainX, trainY, index)
    scores = kFoldCV(trainX, trainY, index, k=5)
    print("cross validation scores:", scores)
    # print("sklearn cross validation scores:", skscores)


    if index in (1, 2):
        print("feature score ", pd.DataFrame(rf.feature_importances_))
    elif index == 3:
        # print("feature score ", pd.DataFrame(rf.feature_importances_))
        xgb.plot_importance(rf)
        plt.savefig("../model/importance3.jpg")
    saveDir = "../model/"
    joblib.dump(rf, saveDir + saveName)
    return rf
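

# To use the saved model at inference time, the log transform has to be
# inverted; a sketch (the path mirrors saveDir + saveName above, testX is a
# placeholder feature frame):
# rf = joblib.load("../model/" + saveName)
# pred = np.expm1(rf.predict(testX))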