Example #1
def test_one_hot_encoder_sparse():
    """Test OneHotEncoder's fit and transform."""
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder()
    # discover max values automatically
    X_trans = enc.fit_transform(X).toarray()
    assert_equal(X_trans.shape, (2, 5))
    assert_array_equal(enc.active_features_,
                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       [[0., 1., 0., 1., 1.],
                        [1., 0., 1., 0., 1.]])

    # max value given as 3
    enc = OneHotEncoder(n_values=4)
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 4 * 3))
    assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])

    # max value given per feature
    enc = OneHotEncoder(n_values=[3, 2, 2])
    X = [[1, 0, 1], [0, 1, 1]]
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 3 + 2 + 2))
    assert_array_equal(enc.n_values_, [3, 2, 2])
    # check that transform works with a feature value larger than seen in fit:
    X = np.array([[2, 0, 1], [0, 1, 1]])
    enc.transform(X)

    # test that an error is raised when out of bounds:
    X_too_large = [[0, 2, 1], [0, 1, 1]]
    assert_raises(ValueError, enc.transform, X_too_large)
    assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)

    # test that error is raised when wrong number of features
    assert_raises(ValueError, enc.transform, X[:, :-1])
    # test that error is raised when wrong number of features in fit
    # with prespecified n_values
    assert_raises(ValueError, enc.fit, X[:, :-1])
    # test exception on wrong init param
    assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)

    enc = OneHotEncoder()
    # test negative input to fit
    assert_raises(ValueError, enc.fit, [[0], [-1]])

    # test negative input to transform
    enc.fit([[0], [1]])
    assert_raises(ValueError, enc.transform, [[0], [-1]])
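
Note: this test targets the legacy encoder API; n_values, active_features_ and feature_indices_ were deprecated in scikit-learn 0.20 and removed in 0.22. A minimal sketch of the same fit/transform check against the modern API (assuming scikit-learn >= 0.22, where categories_ lists the observed values per column):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = [[3, 2, 1], [0, 1, 1]]
enc = OneHotEncoder()  # categories are inferred from the data, per column
X_trans = enc.fit_transform(X).toarray()

# one array of observed categories per input column
assert [list(c) for c in enc.categories_] == [[0, 3], [1, 2], [1]]
# 2 + 2 + 1 = 5 output columns, same layout as the legacy test above
np.testing.assert_array_equal(X_trans, [[0., 1., 0., 1., 1.],
                                        [1., 0., 1., 0., 1.]])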
Example #3
def xgboost_lr_model(train, label):

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        label,
                                                        test_size=0.3,
                                                        random_state=42)

    # Define the XGBoost model
    xgboost = xgb.XGBClassifier(nthread=4,
                                learning_rate=0.08,
                                n_estimators=50,
                                max_depth=5,
                                gamma=0,
                                subsample=0.9,
                                colsample_bytree=0.5)
    # Train the XGBoost model
    xgboost.fit(X_train, y_train)

    # XGBoost prediction and AUC evaluation
    y_pred_test = xgboost.predict_proba(X_test)[:, 1]
    xgb_test_auc = roc_auc_score(y_test, y_pred_test)
    print('xgboost test auc: %.5f' % xgb_test_auc)

    # Encode the original features as XGBoost leaf indices
    X_train_leaves = xgboost.apply(X_train)
    X_test_leaves = xgboost.apply(X_test)

    # Concatenate the encoded training and test data
    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)

    # One-hot encode all leaf features
    xgbenc = OneHotEncoder()
    X_trans = xgbenc.fit_transform(All_leaves)

    (train_rows, cols) = X_train_leaves.shape

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the XGBoost-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # Prediction and AUC evaluation
    y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    xgb_lr_auc1 = roc_auc_score(y_test, y_pred_xgblr1)
    print('LR AUC based on XGB-encoded features: %.5f' % xgb_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Prediction and AUC evaluation
    y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
    xgb_lr_auc2 = roc_auc_score(y_test, y_pred_xgblr2)
    print('LR AUC based on combined features: %.5f' % xgb_lr_auc2)
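
Note: the function above fits the encoder on the concatenation of train and test leaves so transform never meets an unseen leaf index, at the price of letting test rows influence the encoder. A hedged alternative (assuming scikit-learn >= 0.20, names as in the example above) fits on the training leaves only and ignores unknown leaves at prediction time:

from sklearn.preprocessing import OneHotEncoder

# Fit on training leaves only; a leaf unseen in training becomes an all-zero row
xgbenc = OneHotEncoder(handle_unknown='ignore')
X_train_trans = xgbenc.fit_transform(X_train_leaves)
X_test_trans = xgbenc.transform(X_test_leaves)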
Example #4
def gbdt_lr_train(libsvmFileName):

    # Load sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)

    # Train the model
    gbdt.fit(X_train, y_train)

    # Prediction and AUC evaluation
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Train LR on the original features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)    # prediction and AUC evaluation
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC based on original features: %.5f' % lr_test_auc)

    # Encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # Prediction and AUC evaluation
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC based on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Prediction and AUC evaluation
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC based on combined features: %.5f' % gbdt_lr_auc2)
Example #5
 def Predict(self, X_test):
     X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
     gbdtenc = OneHotEncoder()
     self.X_trans = gbdtenc.fit_transform(np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
     X_test_ext = hstack([self.X_trans[self.train_rows:, :], X_test])
     y_pred_gbdtlr2 = self.lr.predict_proba(X_test_ext)[:, 1]
     return list(y_pred_gbdtlr2)
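
Note: this Predict refits the encoder on concatenated train and test leaves on every call. A sketch of a variant that reuses an encoder fitted once at training time; it assumes hypothetical attributes self.enc (a OneHotEncoder(handle_unknown='ignore') fitted on self.X_train_leaves) and an LR trained on the matching column layout:

from scipy.sparse import hstack

def predict(self, X_test):
    # Reuse the encoder fitted at training time; leaves unseen in training
    # map to all-zero columns instead of raising an error
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    X_test_ext = hstack([self.enc.transform(X_test_leaves), X_test])
    return list(self.lr.predict_proba(X_test_ext)[:, 1])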
Example #6
def test_one_hot_encoder_dense():
    # check for sparse=False
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder(sparse=False)
    # discover max values automatically
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 5))
    assert_array_equal(enc.active_features_,
                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       np.array([[0., 1., 0., 1., 1.], [1., 0., 1., 0., 1.]]))
Example #7
    def construct_features(self):
        '''
        Construct features.
        '''
        # Parse date features.
        print "Parsing date features"
        parsed_train_X = self.parse_date_feature(self.train_x[:, 0])
        parsed_test_X = self.parse_date_feature(self.test_x[:, 0])

        # Parse other features.
        print "Parsing all features"
        total_train = len(self.train_x)
        total_test = len(self.test_x)

        for index_feature in range(1, len(self.train_x[0])):
            print "Processing feature ", index_feature

            # Check if we have a categorical feature.
            labels = np.unique(self.train_x[:, index_feature])

            # If we have string or binary labels, we have a categorical feature.
            if isinstance(self.train_x[0, index_feature], str) or len(labels) == 2:
                # We have a categorical feature.

                # Encode it in the one hot format.
                original_data = np.hstack((self.train_x[:, index_feature],
                                           self.test_x[:, index_feature]))

                label_encoder = LabelEncoder()
                data_label_encoded = label_encoder.fit_transform(original_data)
                encoder = OneHotEncoder()
                data_encoded = encoder.fit_transform(data_label_encoded.reshape((len(data_label_encoded), 1)))
                data_encoded = np.asarray(data_encoded.todense()).astype(bool)

                # Add encoded feature to data.
                parsed_train_X = np.hstack((parsed_train_X, data_encoded[0:total_train, :]))
                parsed_test_X = np.hstack((parsed_test_X, data_encoded[total_train:, :]))
                del data_encoded
            else:
                # We have a numeric feature.

                # Just add it to the data.
                parsed_train_X = np.hstack((parsed_train_X,
                                            self.train_x[:, index_feature].reshape((total_train, 1))))
                parsed_test_X = np.hstack((parsed_test_X,
                                           self.test_x[:, index_feature].reshape((total_test, 1))))

        self.train_x = parsed_train_X
        self.test_x = parsed_test_X
Example #9
def xgboost_lr_train():
    path = ['data_11_02.csv', 'data_11_03.csv']
    data, label = import_data([], [], path)

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        label,
                                                        test_size=0.1,
                                                        random_state=42)

    xgboost = xgb.XGBClassifier(nthread=4,
                                learning_rate=0.02,
                                min_child_weight=5,
                                tree_method='gpu_hist',
                                n_estimators=500,
                                max_depth=5)
    global model_xgb, model_tmp, model_lr
    model_xgb = xgboost.fit(X_train, y_train)

    y_pred_test = xgboost.predict_proba(X_test)[:, 1]
    xgb_test_auc = roc_auc_score(y_test, y_pred_test)
    print('xgboost test auc: %.5f' % xgb_test_auc)

    X_train_leaves = xgboost.apply(X_train)
    X_test_leaves = xgboost.apply(X_test)

    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)

    xgbenc = OneHotEncoder()
    X_trans = xgbenc.fit_transform(All_leaves)

    (train_rows, cols) = X_train_leaves.shape

    lr = LogisticRegression()
    model_tmp = lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    xgb_lr_auc1 = roc_auc_score(y_test, y_pred_xgblr1)
    print('LR AUC based on XGB-encoded features: %.5f' % xgb_lr_auc1)

    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    model_lr = lr.fit(X_train_ext, y_train)
    joblib.dump(model_lr, 'lr')
    y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
    xgb_lr_auc2 = roc_auc_score(y_test, y_pred_xgblr2)
    print('LR AUC based on combined features: %.5f' % xgb_lr_auc2)
Example #10
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     # Load sample data
     X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
     # Train/test split
     X_train, X_test, y_train, y_test = train_test_split(Train_libsvm, Train_tab, test_size=0.1, random_state=42)
     # Train the GBDT model
     self.gbdt.fit(X_train, y_train)
     # Encode the original features as GBDT leaf indices
     self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
     # One-hot encode all leaf features
     (self.train_rows, cols) = self.X_train_leaves.shape
     gbdtenc = OneHotEncoder()
     X_trans = gbdtenc.fit_transform(np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
     X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])
     # Train LR on the combined features
     self.lr.fit(X_train_ext, y_train)
Example #11
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     # Load sample data
     X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
     # Train/test split
     X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                         y_all,
                                                         test_size=0.1,
                                                         random_state=42)
     # Define the GBDT model
     gbdt = GradientBoostingClassifier(n_estimators=40,
                                       max_depth=3,
                                       verbose=0,
                                       max_features=0.5)
     # Train the model
     gbdt.fit(X_train, y_train)
     # Encode the original features as GBDT leaf indices
     X_train_leaves = gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = gbdt.apply(X_test)[:, :, 0]
     # One-hot encode all leaf features
     (train_rows, cols) = X_train_leaves.shape
     gbdtenc = OneHotEncoder()
     X_trans = gbdtenc.fit_transform(
         np.concatenate((X_train_leaves, X_test_leaves), axis=0))
     # Define the LR model
     lr = LogisticRegression(n_jobs=-1)
     # Combine the encoded and original features
     X_train_ext = hstack([X_trans[:train_rows, :], X_train])
     X_test_ext = hstack([X_trans[train_rows:, :], X_test])
     # Train LR on the combined features
     lr.fit(X_train_ext, y_train)
     # Persist the model, then predict
     filename = 'finalized_model.sav'
     pickle.dump(lr, open(filename, 'wb'))
     # load the model from disk
     loaded_model = pickle.load(open(filename, 'rb'))
     y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
     print(y_pred_gbdtlr2)
Example #12
def gbdt_lr_train():
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]

        # Define the GBDT model
        gbdt = GradientBoostingClassifier(n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        # Train the model
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('gbdt auc based on original features: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)

        # Train LR on the original features
        lr = LogisticRegression()
        lr.fit(X_train, y_train)  # prediction and AUC evaluation
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('LR AUC based on original features: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)

        # Encode the original features as GBDT leaf indices
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]

        # One-hot encode all leaf features
        (train_rows, cols) = X_train_leaves.shape

        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_valid_leaves), axis=0))

        # Define the LR model
        lr = LogisticRegression()
        # Train LR on the GBDT-encoded samples
        lr.fit(X_trans[:train_rows, :], y_train)
        # Prediction and AUC evaluation
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('LR AUC based on GBDT-encoded features: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)

        # Define the LR model
        lr = LogisticRegression(n_jobs=-1)
        # Combine the encoded and original features
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_valid_ext = hstack([X_trans[train_rows:, :], X_valid])

        print(X_train_ext.shape)
        # Train LR on the combined features
        lr.fit(X_train_ext, y_train)

        # Prediction and AUC evaluation
        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('LR AUC based on combined features: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)

    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("GBDT on original features cv_gbdt:", cv_gbdt)
    print("LR on original features cv_lr:", cv_lr)
    print("LR on GBDT-encoded features cv_lr_trans:", cv_lr_trans)
    print("LR on GBDT-encoded plus original features cv_lr_trans_raw:", cv_lr_trans_raw)
Example #13
def xgb_lr_train():
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_xgb_scores = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]

        # Define the XGBoost model
        xgboost = xgb.XGBClassifier(nthread=4, learning_rate=0.08,
                                    n_estimators=100, max_depth=4,
                                    gamma=0, subsample=0.7, colsample_bytree=0.7,
                                    verbosity=1)
        # Train the model
        xgboost.fit(X_train, y_train)
        y_pred_valid = xgboost.predict_proba(X_valid)[:, 1]
        xgb_valid_auc = roc_auc_score(y_valid, y_pred_valid)
        print('xgb auc based on original features: %.5f' % xgb_valid_auc)
        cv_xgb_scores.append(xgb_valid_auc)

        # Encode the original features as XGBoost leaf indices
        X_train_leaves = xgboost.apply(X_train)
        X_valid_leaves = xgboost.apply(X_valid)
        # Concatenate the encoded training and validation data
        All_leaves = np.concatenate((X_train_leaves, X_valid_leaves), axis=0)
        All_leaves = All_leaves.astype(np.int32)
        # One-hot encode all leaf features
        xgbenc = OneHotEncoder()
        X_trans = xgbenc.fit_transform(All_leaves)
        (train_rows, cols) = X_train_leaves.shape

        # Define the LR model
        lr = LogisticRegression()
        # Train LR on the XGBoost-encoded samples
        lr.fit(X_trans[:train_rows, :], y_train)
        # Prediction and AUC evaluation
        y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        xgb_lr_auc1 = roc_auc_score(y_valid, y_pred_xgblr1)
        print('LR AUC based on XGB-encoded features: %.5f' % xgb_lr_auc1)
        cv_lr_trans_scores.append(xgb_lr_auc1)

        # Define the LR model
        lr = LogisticRegression(n_jobs=-1)
        # Combine the encoded and original features
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_test_ext = hstack([X_trans[train_rows:, :], X_valid])

        # Train LR on the combined features
        lr.fit(X_train_ext, y_train)

        # Prediction and AUC evaluation
        y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
        xgb_lr_auc2 = roc_auc_score(y_valid, y_pred_xgblr2)
        print('LR AUC based on combined features: %.5f' % xgb_lr_auc2)
        cv_lr_trans_raw_scores.append(xgb_lr_auc2)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_xgb = np.mean(cv_xgb_scores)

    print("==" * 20)
    print("XGB on original features cv_xgb:", cv_xgb)
    print("LR on XGB-encoded features cv_lr_trans:", cv_lr_trans)
    print("LR on XGB-encoded plus original features cv_lr_trans_raw:", cv_lr_trans_raw)
Example #14
def xgboost_lr_train(libsvmTrain, libsvmTest):
    # Load train/test data
    X_train, y_train = load_svmlight_file(libsvmTrain, n_features=1491, offset=1)
    X_test, y_test = load_svmlight_file(libsvmTest, n_features=1491, offset=1)
    print("train top5 example:\n")
    print_example4cpp(X_train, 5, 0)

    print("test top5 example:\n")
    print_example4cpp(X_test, 5, 0)

    # Define the XGBoost model
    # nthread = 4, gamma = 0, subsample = 0.9, colsample_bytree = 0.5,
    xgboost = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=100,
        max_depth=3,
        max_leaf_nodes=10,
        # eval_metric="auc",
        missing=-999
    )

    # xgboost = xgb.XGBRegressor(nthread=4, learning_rate=1,
    #                            n_estimators=2, max_depth=3, gamma=0, subsample=0.9, colsample_bytree=0.5)

    # Train the XGBoost model
    xgboost.fit(X_train, y_train)
    xgboost.save_model("../data/test1.model")

    cp_model = xgb.Booster(model_file='../data/test1.model')
    cp_model.dump_model("../data/test1.raw.txt")

    # XGBoost prediction and AUC evaluation
    y_pred_test1 = xgboost.predict_proba(X_test)[:, 1]     # for classifier
    # y_pred_test2 = xgboost.predict(X_test)
    xgb_test_auc = roc_auc_score(y_test, y_pred_test1)
    print('xgboost test auc: %.5f' % xgb_test_auc)
    print("test top5 pred1: {}".format(list(zip(y_test[:5], y_pred_test1[:5]))))
    # print("test top5 pred2: {}".format(list(zip(y_test[:5], y_pred_test2[:5]))))

    # Encode the original features as XGBoost leaf indices
    X_train_leaves = xgboost.apply(X_train)
    X_test_leaves = xgboost.apply(X_test)

    # Concatenate the encoded training and test data
    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)

    print("X_train leaves index: \n{}".format(X_train_leaves[:5]))

    # One-hot encode all leaf features
    xgbenc = OneHotEncoder()
    X_trans = xgbenc.fit_transform(All_leaves)

    print("X_train top5 onehot encode:")
    for i in range(5):
        print(X_trans[i].toarray())
    (train_rows, cols) = X_train_leaves.shape
    print("\nnew x_train shape for lr: {}".format(X_train_leaves.shape))

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the XGBoost-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # Prediction and AUC evaluation
    y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    xgb_lr_auc1 = roc_auc_score(y_test, y_pred_xgblr1)
    print('LR AUC based on XGB-encoded features: %.5f' % xgb_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1, max_iter=100)
    # Combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Prediction and AUC evaluation
    y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
    xgb_lr_auc2 = roc_auc_score(y_test, y_pred_xgblr2)
    print('LR AUC based on combined features: %.5f' % xgb_lr_auc2)
Example #15
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline):

    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    # n_estimators=20, max_depth=3, verbose=0, max_features=0.5

    # Train the model
    gbdt.fit(train[gbdt_features], train[target])

    # Prediction and log-loss evaluation
    if isOnline == False:
        y_pred_gbdt = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    else:
        y_pred_gbdt = gbdt.predict_proba(train[gbdt_features].tail(57562))[:, 1]
        gbdt_test_log_loss = log_loss(train[target].tail(57562), y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # Encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]

    # One-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Define the LR model
    lr = LogisticRegression()

    # Train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], train[target])

    # Prediction and log-loss evaluation
    if isOnline == False:
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('LR log_loss based on GBDT-encoded features: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # Define the LR model
    lr = LogisticRegression()

    # Combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], train[lr_features]])
    X_test_ext = hstack([X_trans[train_rows:, :], test[lr_features]])

    print("gbdt output", X_trans[:train_rows, :].shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)

    # Train LR on the combined features
    lr.fit(X_train_ext, train[target])

    # Prediction and log-loss evaluation
    if isOnline == False:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('LR log_loss based on combined features: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')

        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        test[['instance_id', 'predicted_score']].to_csv('../baseline_' + name + '.csv', index=False, sep=' ')  # save the online submission result
        print('Saved result success!')
Example #16
def gbdt_lr_train(libsvmFileName):

    # Load sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    print("train data shape: ", X_train.shape)

    # Train the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Prediction and AUC evaluation
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Train LR on the original features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)  # prediction and AUC evaluation
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC based on original features: %.5f' % lr_test_auc)

    # Encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    for i in range(0, len(X_train_leaves[0])):
        cateMap = {}
        for j in range(0, len(X_train_leaves)):
            cateMap[X_train_leaves[j][i]] = 0
        print("F%d: %d" % (i, len(cateMap)))

    # One-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])
    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # Prediction and AUC evaluation
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC based on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("gbdt leaves cross", X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Prediction and AUC evaluation
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC based on combined features: %.5f' % gbdt_lr_auc2)
Example #17
### Offline prediction
print("Offline prediction")
preds_offline = lgb_model.predict(offline_test_X, num_iteration=lgb_model.best_iteration)  # predicted probabilities
offline = offline_test[['USRID', 'FLAG']]
offline['preds'] = preds_offline  # attach predictions for evaluation
offline.FLAG = offline['FLAG'].astype(np.float64)
fpr, tpr, thresholds = metrics.roc_curve(offline.FLAG, offline['preds'])
print('AUC:', metrics.auc(fpr, tpr))
# Online
print("Online prediction")
online = DataFrame()
preds_online = lgb_model.predict(online_test_X, num_iteration=lgb_model.best_iteration)  # predicted probabilities
online['USRID'] = test_data['USRID'].astype(int)
online['RST'] = preds_online
online.to_csv("test_result.csv", index=False, sep='\t')
# Offline prediction with the generated leaf features
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
train_new_feature = lgb_model.predict(train_data.drop(['USRID', 'FLAG'], axis=1), pred_leaf=True)
test_new_feature = lgb_model.predict(test_data.drop(['USRID'], axis=1), pred_leaf=True)
train_new_feature = onehot.fit_transform(train_new_feature)
test_new_feature = onehot.fit_transform(test_new_feature)
train_new = pd.concat([train_data, DataFrame(train_new_feature.toarray())], axis=1)
test_new = pd.concat([test_data, DataFrame(test_new_feature.toarray())], axis=1)
# Build the LR model
from sklearn.linear_model import LogisticRegressionCV
LR1 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10, 20, 30, 100],
                           cv=3, penalty='l2', solver='lbfgs', scoring='roc_auc', class_weight='balanced',
                           n_jobs=-1, random_state=42, verbose=1)
LR1.fit(train_new.drop(['USRID', 'FLAG'], axis=1), train_new.FLAG)
Example #18
data_process(train_agg)
data_process(test_agg)

del a, gp, gp_day_mean, gp_day_var, gp1, gp2, gp3, gp4, index1, l, m1, m2, m3, merge_log, ss, ss2, t1, t2, t3, train_flg
# Construct new features with GBDT
gbdt = GradientBoostingClassifier(loss='exponential', learning_rate=0.12, n_estimators=60, max_depth=3, random_state=42, max_features=None)
X_train = train_agg.drop(['USRID', 'FLAG'], axis=1)
y_train = train_agg['FLAG']
# Train the model
gbdt.fit(X_train, y_train)
# Encode the original features as GBDT leaf indices
X_train_leaves = gbdt.apply(X_train)[:, :, 0]
X_test_leaves = gbdt.apply(test_agg.drop('USRID', axis=1))[:, :, 0]
(train_rows, cols) = X_train_leaves.shape
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

# Combine the encoded and original features
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())
X_train_agg.rename(columns={494: "USRID", 495: "FLAG"}, inplace=True)
X_test_agg.rename(columns={494: "USRID"}, inplace=True)

# Training and test sets

train_data = pd.merge(X_train_agg, train_log, on='USRID', how='left')
test_data = pd.merge(X_test_agg, test_log, on='USRID', how='left')
del X_train_agg, X_test_agg, train_log, test_log
# Modeling
import lightgbm as lgb
train_xy, offline_test = train_test_split(train_data, test_size=0.3, random_state=42)
Example #19
def _run_one_hot(X, X2, cat):
    enc = OneHotEncoder(categorical_features=cat)
    Xtr = enc.fit_transform(X)
    X2tr = enc.transform(X2)
    return Xtr, X2tr
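
Note: categorical_features was deprecated in scikit-learn 0.20 and removed in 0.22. A sketch of the same helper against the modern API, one-hot encoding only the columns listed in cat via ColumnTransformer and passing the rest through:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def _run_one_hot(X, X2, cat):
    # one-hot encode the columns in `cat`, keep the remaining columns as-is
    ct = ColumnTransformer([('onehot', OneHotEncoder(), cat)],
                           remainder='passthrough')
    Xtr = ct.fit_transform(X)
    X2tr = ct.transform(X2)
    return Xtr, X2tr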
Example #20
for i in range(1, 5):
    data.loc[data['Apartment'] == i, 'expensive than average apartment'] = data.loc[data['Apartment'] == i, 'Price'] - \
                                                                           data.loc[data['Apartment'] == i, 'Price'].mean()
for i in range(1, 5):
    data.loc[data['Beds'] == i, 'expensive than average bed'] = data.loc[data['Beds'] == i, 'Price'] - \
                                                                data.loc[data['Beds'] == i, 'Price'].mean()
threshold1 = Binarizer(threshold=3.0)
res1 = pd.DataFrame(threshold1.transform(data['Review'].values.reshape(-1, 1)))
threshold2 = Binarizer(threshold=80)
res2 = pd.DataFrame(threshold2.transform(data['Price'].values.reshape(-1, 1)))
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

res3 = pd.DataFrame(
    pf.fit_transform(
        data[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']]))

encoder = OneHotEncoder()
data_region1hot = encoder.fit_transform(data['Region'].values.reshape(-1, 1))
data_region = pd.DataFrame(data_region1hot.toarray())
data_weekday1hot = encoder.fit_transform(data['Weekday'].values.reshape(-1, 1))
data_weekday = pd.DataFrame(data_weekday1hot.toarray())
data_reformed = pd.concat(
    [data.drop(columns=['ID']), data_region, data_weekday, res1, res2, res3],
    axis=1)

Seed = 40

split = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=Seed)
for train_index, test_index in split.split(data_reformed,
                                           data_reformed['Accept']):
    train = data_reformed.loc[train_index]
    test = data_reformed.loc[test_index]
train_data = train.loc[:, train.columns != 'Accept']
Example #21
fin.close()
sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(5000)
tokenizer.fit_on_texts(sents)
vocab_size = len(tokenizer.word_counts) + 1

xs = []
ys = []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)

ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)


print('End')
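
Note: n_values=vocab_size above pins the one-hot width so X and Y come out with identical shapes even if some word ids never occur in xs or ys. On scikit-learn >= 0.22 the same effect comes from an explicit category list; a sketch with the names from the example:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# fix the category set up front so every matrix has vocab_size columns
ohe = OneHotEncoder(categories=[np.arange(vocab_size)])
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()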

Example #22
_, xgb_calibration = calibration(preds, dtest)
print('Trained XGB model auc: {:.4f}, ne: {:.4f}, logloss: {:.4f}, calibration: {:.4f}'.
      format(xgb_auc1, xgb_ne, xgb_lls, xgb_calibration))
# print("test top5 pred: {}".format([(labels[i], preds[i]) for i in range(5)]))

# onehot
dr_leaves = bst.predict(dtrain, pred_leaf=True)
dt_leaves = bst.predict(dtest, pred_leaf=True)

all_leaves = np.concatenate((dr_leaves, dt_leaves), axis=0)
all_leaves = all_leaves.astype(np.int32)

# print("dtrain leaves index: \n{}".format(dr_leaves[:5]))

xgb_enc = OneHotEncoder()
X_trans = xgb_enc.fit_transform(all_leaves)
# print("X_train top5 onehot encode:")
# for i in range(5):
#     print(X_trans[i].toarray())

(train_rows, cols) = dr_leaves.shape
print("\nnew x_train shape for lr: {}".format(dr_leaves.shape))
# Define the LR model
lr = LogisticRegression(max_iter=200)
# Train LR on the XGBoost-encoded samples
lr.fit(X_trans[:train_rows, :], dtrain.get_label())
# Prediction and AUC evaluation
y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
xgb_lr_auc1 = roc_auc_score(dtest.get_label(), y_pred_xgblr1)
_, xgb_lr_ne = normalized_entropy(y_pred_xgblr1, dtest)
_, xgb_lr_lls = log_loss(y_pred_xgblr1, dtest)
Example #23
def get_cat_one_hot_feature(tag='val'):

    if tag == 'train':

        print("获取类别特征one_hot线上提交数据")

        print("训练集长度: " + '478032')
        print("测试集长度:" + '42888')

        path = config.cache_prefix_path + 'cat_one_hot_train.npz'

        if os.path.exists(path):
            cat_one_hot_train = utils.load_sparse_csr(path)
            return cat_one_hot_train

        data = pd.read_pickle(config.data_prefix_path +
                              'data.pkl')[config.CAT_COLS]

        labelEncoding = LabelEncoder()
        for col in data.head(0):
            data[col] = labelEncoding.fit_transform(data[col].astype(str))

        onehotEncoding = OneHotEncoder()
        data = onehotEncoding.fit_transform(data)
        print(data.shape)

        utils.save_sparse_csr(path, data)

        return data

    elif tag == 'val':

        print("获取类别特征one_hot线下验证数据")

        print("训练集长度: " + '420627')
        print("验证集长度:" + '57405')

        path = config.cache_prefix_path + 'cat_one_hot_val.npz'

        if os.path.exists(path):
            cat_one_hot_val = utils.load_sparse_csr(path)
            return cat_one_hot_val

        data = pd.read_pickle(config.data_prefix_path +
                              'data.pkl')[config.CAT_COLS + ['day']]

        train = data[data.day < 24]
        test = data[data.day == 24]

        del data
        gc.collect()

        train.drop(['day'], axis=1, inplace=True)
        test.drop(['day'], axis=1, inplace=True)

        data = pd.concat([train, test], axis=0)
        del train, test

        gc.collect()

        labelEncoding = LabelEncoder()
        for col in data.head(0):
            data[col] = labelEncoding.fit_transform(data[col].astype(str))

        onehotEncoding = OneHotEncoder()
        data = onehotEncoding.fit_transform(data)
        print(data.shape)

        utils.save_sparse_csr(path, data)

        return data
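
Note: the LabelEncoder-then-OneHotEncoder round trip above was needed on old scikit-learn, whose OneHotEncoder accepted only integers. Since 0.20 it encodes strings directly, so a hedged simplification of the encoding step (same frame of categorical columns) is:

from sklearn.preprocessing import OneHotEncoder

# encode the string-valued categorical columns in one pass
onehotEncoding = OneHotEncoder(handle_unknown='ignore')
data = onehotEncoding.fit_transform(data.astype(str))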
Example #24
def get_xgboost_one_hot_feature(tag='val'):

    params = {
        'booster': 'gbtree',
        'num_leaves': 35,
        'max_depth': 7,
        'eta': 0.05,
        'max_bin': 425,
        'subsample_for_bin': 50000,
        'objective': 'binary:logistic',
        'min_split_gain': 0,
        'min_child_weight': 6,
        'min_child_samples': 10,
        #'colsample_bytree': 0.8,  # fraction of features sampled when building each tree; default 1
        #'subsample': 0.9,  # fraction of the sample set used to train the model
        'subsample_freq': 1,
        'colsample_bytree': 1,
        'reg_lambda': 4,  # L2 regularization on the weights; larger values make the model less prone to overfitting
        'alpha': 4,  # L1 regularization
        'seed': 2018,
        'nthread': 7,
        'silent': True,
        'gamma': 0.2,
        'eval_metric': 'logloss'
    }

    object_features = [
        "predict_category_1",
        "predict_category_2",
        "predict_category_0",
        "predict_property_0",
        "predict_property_1",
        "predict_property_2",
        "property_1",
        "property_0",
        "property_2",
        "category_1",
        "category_0",
        "category_2",
        'category_cross_0',
        'category_cross_1',
        'category_cross_2',
        'hour_and_category_1',
        'user_gender_id',
        'user_occupation_id',
    ]

    if tag == 'train':

        print("获取xgboost特征one_hot线上提交数据")

        print("训练集长度: " + '478032')
        print("测试集长度:" + '42888')

        path = config.cache_prefix_path + 'xgboost_one_hot_train.npz'

        if os.path.exists(path):
            xgboost_one_hot_train = utils.load_sparse_csr(path)
            return xgboost_one_hot_train

        data = pd.read_pickle(config.data_prefix_path + 'data.pkl')

        features = [
            c for c in data.columns if c not in [
                'is_trade',
                'instance_id',
                'index',
                'context_id',
                'time',
                'day',
                'context_timestamp',
                'property_list',
                'category_list',
                'property_predict_list',
                'category_predict_list',
                'item_category_list',
                'item_property_list',
                'predict_category_property',
                'user_id',
                'item_id',
                'item_brand_id',
                'item_city_id',
                'shop_id',
            ] and c not in object_features
        ]
        target = ['is_trade']

        train = data[data.is_trade.notnull()]
        test = data[data.is_trade.isnull()]
        del data
        gc.collect()

        xgb_train = xgb.DMatrix(train[features], label=train[target])
        xgb_test = xgb.DMatrix(test[features])
        del train, test
        gc.collect()

        model = xgb.train(params, xgb_train, 200, [(xgb_train, 'train')])

        train_leaves = model.predict(xgb_train, pred_leaf=True)
        test_leaves = model.predict(xgb_test, pred_leaf=True)
        del xgb_train, xgb_test
        gc.collect()

        onehotEncoding = OneHotEncoder()
        trans = onehotEncoding.fit_transform(
            np.concatenate((train_leaves, test_leaves), axis=0))

        utils.save_sparse_csr(path, trans)
        return trans

    elif tag == 'val':

        print("获取xgboost特征one_hot线下验证数据")

        print("训练集长度: " + '420627')
        print("测试集长度:" + '57405')

        path = config.cache_prefix_path + 'xgboost_one_hot_val.npz'

        if os.path.exists(path):
            xgboost_one_hot_val = utils.load_sparse_csr(path)
            return xgboost_one_hot_val

        data = pd.read_pickle(config.data_prefix_path + 'data.pkl')

        features = [
            c for c in data.columns if c not in [
                'is_trade',
                'instance_id',
                'index',
                'context_id',
                'time',
                'day',
                'context_timestamp',
                'property_list',
                'category_list',
                'property_predict_list',
                'category_predict_list',
                'item_category_list',
                'item_property_list',
                'predict_category_property',
                'user_id',
                'item_id',
                'item_brand_id',
                'item_city_id',
                'shop_id',
            ] and c not in object_features
        ]
        target = ['is_trade']

        data = data[data.is_trade.notnull()]
        train = data[data.day < 24]
        val = data[data.day == 24]

        xgb_train = xgb.DMatrix(train[features], label=train[target])
        xgb_val = xgb.DMatrix(val[features], label=val[target])

        del train, val, data
        gc.collect()

        model = xgb.train(params, xgb_train, 200, [(xgb_train, 'train'),
                                                   (xgb_val, 'valid')])

        train_leaves = model.predict(xgb_train, pred_leaf=True)
        val_leaves = model.predict(xgb_val, pred_leaf=True)

        del xgb_train, xgb_val
        gc.collect()

        onehotEncoding = OneHotEncoder()
        trans = onehotEncoding.fit_transform(
            np.concatenate((train_leaves, val_leaves), axis=0))

        utils.save_sparse_csr(path, trans)
        return trans
Example #26
def gbdt_lr_train(libsvmFileName):

    # Load sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # X_all_dense = X_all.todense()
    print(type(X_all))
    # print(type(X_all_dense[0]))
    # print(y_all)
    # print("===")

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    # print(X_train)
    # print(y_train)

    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)

    # Train the model
    gbdt.fit(X_train, y_train)

    # Prediction and AUC evaluation
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)
    # print(y_pred_gbdt)
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)  # gbdt auc: 0.96455

    # Train LR on the original features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)  # prediction and AUC evaluation
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC based on original features: %.5f' % lr_test_auc)  # LR AUC based on original features: 0.93455

    # Encode the original features as GBDT leaf indices
    # X_train_leaves = gbdt.apply(X_train)
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22,:])  # print 22 rows, all columns
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22,:])

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # Prediction and AUC evaluation
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC based on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("Number of combined features:", X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Prediction and AUC evaluation
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC based on combined features: %.5f' % gbdt_lr_auc2)