Example No. 1
import numpy as np
import pandas as pd


def test_gbm_classifier_backupsklearn(backend='auto'):
    # Load the credit-card dataset; the last column is the target label.
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run the h2o4gpu version of GradientBoostingClassifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version of GradientBoostingClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all() == True
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)
                ).all() == True
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all() == True
        assert (gbm.score(X, y) == gbm_sk.score(X, y)).all() == True
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]
                ).all() == True
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all() == True

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
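
One way to exercise both code paths of the test above, assuming it lives in a pytest suite; the set of valid backend strings ('auto', 'sklearn', 'h2o4gpu') is an assumption about the h2o4gpu wrapper API:

import pytest

@pytest.mark.parametrize("backend", ["auto", "sklearn", "h2o4gpu"])
def test_gbm_classifier_backends(backend):
    # Delegates to the comparison test; the sklearn-specific assertions
    # only run for backend == "sklearn".
    test_gbm_classifier_backupsklearn(backend=backend)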
Example No. 3

    # NOTE: the start of this example is truncated; `algo` is reconstructed
    # minimally, and X_train/X_test/y_train/y_test are assumed to come from
    # an earlier train/test split of a 4-feature dataset.
    algo = GradientBoostingClassifier(n_estimators=2,
                                      max_depth=1,
                                      criterion='friedman_mse')
    # Train the model.
    algo.fit(X_train, y_train)
    # Evaluate the model.
    print('Accuracy on the training set: {}'.format(algo.score(X_train, y_train)))
    print('Accuracy on the test set: {}'.format(algo.score(X_test, y_test)))

    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
    print('Predicted labels for the samples:')
    print(algo.predict(x_test))
    print("Predicted class probabilities for the samples:")
    print(algo.predict_proba(x_test))
    print("Log of the predicted class probabilities:")
    print(algo.predict_log_proba(x_test))

    print("训练好的所有子模型:\n{}".format(algo.estimators_))
    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 2.9, 0.8]]
    generator = algo.staged_predict(x_test)
    print('阶段预测值:')
    for i in generator:
        print(i)
    print('各特征属性权重列表:{}'.format(algo.feature_importances_))

    # Visualize every sub-estimator (requires `from sklearn import tree`).
    for k, estimators in enumerate(algo.estimators_):
        for j, estimator in enumerate(estimators):
            dot_data = tree.export_graphviz(
                decision_tree=estimator,
                out_file=None)
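            # Hedged continuation: the original snippet is cut off above.
            # Render each exported dot string with the `graphviz` package;
            # the output filename scheme is illustrative only.
            import graphviz
            graphviz.Source(dot_data).render(
                'gbm_tree_stage{}_class{}'.format(k, j),
                format='png', cleanup=True)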
Example No. 4
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: the compute_pairwise_transition_features(_with_tsn) helpers used
# below are project-local and not shown in this snippet.


class TransitionCostModel(object):
    """
    The transition cost model computes the cost of linking two detections on a
    single-object trajectory.

    Parameters
    ----------
    n_estimators : int
        The number of gradient boosting stages to perform. A larger number
        usually results in increased performance at higher computational cost.

    """

    def __init__(self, n_estimators=100, does_use_tsn=False):
        self._classifier = GradientBoostingClassifier(
            n_estimators=n_estimators)
        self._does_use_tsn = does_use_tsn

    def train(self, positive_pairs, negative_pairs):
        """Train model on pairs of positive and negative detections.

        Parameters
        ----------
        positive_pairs : List[Tuple[int, ndarray, ndarray, ndarray, ndarray]]
            A list of pairs that correspond to neighboring detections on an
            object trajectory. Each list entry contains the following items:

            * Time gap between the two detections (successor time index minus
              predecessor time index)
            * Bounding box coordinates of the predecessor detection in format
              (top-left-x, top-left-y, width, height).
            * Appearance descriptor of the predecessor detection.
            * Bounding box coordinates of the successor detection in format
              (top-left-x, top-left-y, width, height).
            * Appearance descriptor of the successor detection.

            If the model was constructed with does_use_tsn=True, each entry
            additionally carries a TSN action descriptor after each appearance
            descriptor (seven items per tuple in total).
        negative_pairs : List[Tuple[int, ndarray, ndarray, ndarray, ndarray]]
            A list of pairs that correspond to two detections of different
            object identities, in the same format as positive_pairs.

        """
        # Compute features.
        train_x, train_y = [], []

        if self._does_use_tsn:
            for time_gap, box1, feature1, tsn_feature1, box2, feature2, tsn_feature2 in positive_pairs:
                train_x.append(
                    compute_pairwise_transition_features_with_tsn(
                        time_gap, box1[np.newaxis, :], feature1[np.newaxis, :],
                        tsn_feature1[np.newaxis, :],
                        box2[np.newaxis, :], feature2[np.newaxis, :],
                        tsn_feature2[np.newaxis, :]).ravel())
                train_y.append(1)
            for time_gap, box1, feature1, tsn_feature1, box2, feature2, tsn_feature2 in negative_pairs:
                train_x.append(
                    compute_pairwise_transition_features_with_tsn(
                        time_gap, box1[np.newaxis, :], feature1[np.newaxis, :],
                        tsn_feature1[np.newaxis, :],
                        box2[np.newaxis, :], feature2[np.newaxis, :],
                        tsn_feature2[np.newaxis, :]).ravel())
                train_y.append(0)
        else:
            for time_gap, box1, feature1, box2, feature2 in positive_pairs:
                train_x.append(
                    compute_pairwise_transition_features(
                        time_gap, box1[np.newaxis, :], feature1[np.newaxis, :],
                        box2[np.newaxis, :], feature2[np.newaxis, :]).ravel())
                train_y.append(1)
            for time_gap, box1, feature1, box2, feature2 in negative_pairs:
                train_x.append(
                    compute_pairwise_transition_features(
                        time_gap, box1[np.newaxis, :], feature1[np.newaxis, :],
                        box2[np.newaxis, :], feature2[np.newaxis, :]).ravel())
                train_y.append(0)

        # Shuffle data and train classifier.
        indices = np.random.permutation(len(train_x))
        train_x = np.asarray(train_x)[indices, :]
        train_y = np.asarray(train_y)[indices]
        self._classifier.fit(train_x, train_y)

    def compute_cost(self, time_gap, boxes1, features1, tsn_features1,
                     boxes2, features2, tsn_features2):
        """Compute transition cost from given features.

        Parameters
        ----------
        time_gap : int
            It is assumed that all detections in boxes1 have been obtained at
            the same time step. Likewise, all detections in boxes2 have to be
            obtained at the same time step. The time_gap is the number of time
            steps between the two times (successor time index minus
            predecessor time index).
        boxes1 : ndarray
            The first Nx4 dimensional array of bounding box coordinates in
            format (top-left-x, top-left-y, width, height).
        features1 : ndarray
            The first NxL dimensional array of N appearance features of
            length L.
        tsn_features1 : ndarray
            The first NxL dimensional array of N action features of
            length L.
        boxes2 : ndarray
            The second Mx4 dimensional array of bounding box coordinates in
            format (top-left-x, top-left-y, width, height).
        features2 : ndarray
            The second MxL dimensional array of M appearance features of
            length L.
        tsn_features2 : ndarray
            The second MxL dimensional array of M action features of
            length L.

        Returns
        -------
        ndarray
            Returns the NxM dimensional matrix of element-wise transition costs
            where element (i, j) contains the transition cost between boxes1[i]
            and boxes2[j].

        """
        if self._does_use_tsn:
            features = compute_pairwise_transition_features_with_tsn(
                time_gap, boxes1, features1, tsn_features1,
                boxes2, features2, tsn_features2)
        else:
            features = compute_pairwise_transition_features(
                time_gap, boxes1, features1, boxes2, features2)
        log_probabilities = self._classifier.predict_log_proba(
            features.reshape(len(boxes1) * len(boxes2), features.shape[-1]))
        return -log_probabilities[:, 1].reshape(len(boxes1), len(boxes2))
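
For orientation, a minimal usage sketch of TransitionCostModel with random stand-in data; the feature dimensions are illustrative, and the project-local compute_pairwise_transition_features helpers must be importable for it to run:

import numpy as np

model = TransitionCostModel(n_estimators=50)

def make_pair():
    # (time_gap, predecessor box, predecessor descriptor,
    #  successor box, successor descriptor)
    return (1,
            np.random.rand(4), np.random.rand(128),
            np.random.rand(4), np.random.rand(128))

model.train([make_pair() for _ in range(100)],   # positive pairs
            [make_pair() for _ in range(100)])   # negative pairs

# 5x7 cost matrix between 5 predecessor and 7 successor detections;
# the TSN arguments are None because does_use_tsn defaults to False.
costs = model.compute_cost(1, np.random.rand(5, 4), np.random.rand(5, 128),
                           None, np.random.rand(7, 4), np.random.rand(7, 128))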
Example No. 5
# Oversample the rare subsidy classes by duplicating their rows
# (DataFrame.append was removed in pandas 2.0, so use pd.concat instead).
Oversampling1000 = train.loc[train.money == 1000]
Oversampling1500 = train.loc[train.money == 1500]
Oversampling2000 = train.loc[train.money == 2000]
train = pd.concat([train]
                  + [Oversampling1000] * 5
                  + [Oversampling1500] * 8
                  + [Oversampling2000] * 10)
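# Hedged alternative: instead of fixed duplication counts, upsample each rare
# class to a target size with sklearn.utils.resample (the target of 1000 rows
# per class is illustrative only).
from sklearn.utils import resample

extra = [resample(train.loc[train.money == label], replace=True,
                  n_samples=1000, random_state=2016)
         for label in (1000, 1500, 2000)]
train = pd.concat([train] + extra)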
# model
clf = GradientBoostingClassifier(n_estimators=300, random_state=2016)
# clf = RandomForestClassifier(n_estimators=500,random_state=2016)
clf = clf.fit(train[predictors], train[target])
result = clf.predict(test[predictors])
result_p = clf.predict_log_proba(test[predictors])
# result_pre=clf.staged_predict_proba(test[predictors])
print('feature_importances_:', clf.feature_importances_)
# Save results
test_result = pd.DataFrame(columns=["id", 'subsidy'])
test_result.id = ids
test_result.subsidy = result
test_result.subsidy = test_result.subsidy.astype(int)
# test_result.pre=result_pre
result_compare = pd.merge(test_pre, test_result, on='id')

from sklearn.metrics import f1_score

print(result_compare.money.values)
print(result_compare.subsidy.values)
# NOTE: the original snippet is truncated here; the second argument and the
# multiclass averaging mode below are reconstructed assumptions.
print(f1_score(result_compare.money.values,
               result_compare.subsidy.values, average='macro'))