def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's GradientBoostingClassifier against scikit-learn's.

    Both classifiers are fitted on the credit-card CSV (all columns but the
    last are features, the last column is the target) with the same random
    seed. When ``backend == "sklearn"`` the two fitted models are expected to
    agree, so their predictions, staged predictions and fitted attributes are
    compared. For any other backend the test only checks that both fits run.

    Parameters
    ----------
    backend : str
        Backend name forwarded to ``h2o4gpu.GradientBoostingClassifier``.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Fit the h2o4gpu gradient boosting classifier.
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Fit the scikit-learn gradient boosting classifier as the reference.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert np.array_equal(gbm.predict(X), gbm_sk.predict(X))
        assert np.array_equal(gbm.predict_log_proba(X),
                              gbm_sk.predict_log_proba(X))
        assert np.array_equal(gbm.predict_proba(X), gbm_sk.predict_proba(X))
        # score() yields a scalar; compare directly instead of calling
        # .all() on the result, which fails for a plain Python bool.
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert np.array_equal(gbm.decision_function(X)[1],
                              gbm_sk.decision_function(X)[1])
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert np.array_equal(gbm.apply(X), gbm_sk.apply(X))

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert np.array_equal(gbm.feature_importances_,
                              gbm_sk.feature_importances_)

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert np.array_equal(gbm.train_score_, gbm_sk.train_score_)
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's GradientBoostingClassifier against scikit-learn's.

    Both classifiers are fitted on the credit-card CSV (all columns but the
    last are features, the last column is the target) with the same random
    seed. When ``backend == "sklearn"`` the two fitted models are expected to
    agree, so their predictions, staged predictions and fitted attributes are
    compared. For any other backend the test only checks that both fits run.

    Parameters
    ----------
    backend : str
        Backend name forwarded to ``h2o4gpu.GradientBoostingClassifier``.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Fit the h2o4gpu gradient boosting classifier.
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Fit the scikit-learn gradient boosting classifier as the reference.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert np.array_equal(gbm.predict(X), gbm_sk.predict(X))
        assert np.array_equal(gbm.predict_log_proba(X),
                              gbm_sk.predict_log_proba(X))
        assert np.array_equal(gbm.predict_proba(X), gbm_sk.predict_proba(X))
        # score() yields a scalar; compare directly instead of calling
        # .all() on the result, which fails for a plain Python bool.
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert np.array_equal(gbm.decision_function(X)[1],
                              gbm_sk.decision_function(X)[1])
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert np.array_equal(gbm.apply(X), gbm_sk.apply(X))

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert np.array_equal(gbm.feature_importances_,
                              gbm_sk.feature_importances_)

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert np.array_equal(gbm.train_score_, gbm_sk.train_score_)
n_estimators=2, max_depth=1, criterion='friedman_mse') # 模型训练 algo.fit(X_train, y_train) # 模型效果评估 print('训练集上的准确率:{}'.format(algo.score(X_train, y_train))) print('测试集上的准确率:{}'.format(algo.score(X_test, y_test))) x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]] print('样本预测值:') print(algo.predict(x_test)) print("样本的预测概率值:") print(algo.predict_proba(x_test)) print("样本的预测概率值的Log转换值:") print(algo.predict_log_proba(x_test)) print("训练好的所有子模型:\n{}".format(algo.estimators_)) x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 2.9, 0.8]] generator = algo.staged_predict(x_test) print('阶段预测值:') for i in generator: print(i) print('各特征属性权重列表:{}'.format(algo.feature_importances_)) # 所有子模型可视化 for k, estimators in enumerate(algo.estimators_): for j, estimator in enumerate(estimators): dot_data = tree.export_graphviz( decision_tree=estimator, out_file=None,
class TransitionCostModel(object):
    """
    The transition cost model computes the cost of linking two detections on
    a single-object trajectory.

    Parameters
    ----------
    n_estimators : int
        The number of gradient boosting stages to perform. A larger number
        usually results in increased performance at higher computational cost.
    does_use_tsn : bool
        If True, pairwise features are computed with
        compute_pairwise_transition_features_with_tsn, which additionally
        consumes TSN feature arrays. Otherwise
        compute_pairwise_transition_features is used and TSN features are
        ignored.

    """

    def __init__(self, n_estimators=100, does_use_tsn=False):
        self._classifier = GradientBoostingClassifier(
            n_estimators=n_estimators)
        self._does_use_tsn = does_use_tsn

    def _pair_features(self, pair):
        """Return the flattened transition feature vector of a single pair."""
        if self._does_use_tsn:
            (time_gap, box1, feature1, tsn_feature1,
             box2, feature2, tsn_feature2) = pair
            features = compute_pairwise_transition_features_with_tsn(
                time_gap,
                box1[np.newaxis, :], feature1[np.newaxis, :],
                tsn_feature1[np.newaxis, :],
                box2[np.newaxis, :], feature2[np.newaxis, :],
                tsn_feature2[np.newaxis, :])
        else:
            time_gap, box1, feature1, box2, feature2 = pair
            features = compute_pairwise_transition_features(
                time_gap,
                box1[np.newaxis, :], feature1[np.newaxis, :],
                box2[np.newaxis, :], feature2[np.newaxis, :])
        return features.ravel()

    def train(self, positive_pairs, negative_pairs):
        """Train model on pairs of positive and negative detections.

        Parameters
        ----------
        positive_pairs : List[Tuple[int, ndarray, ndarray, ndarray, ndarray]]
            A list of pairs that correspond to neighboring detections on an
            object trajectory. Each list entry contains the following items:

            * Time gap between the two detections (successor time index minus
              predecessor time index)
            * Bounding box coordinates of the predecessor detection in format
              (top-left-x, top-left-y, width, height).
            * Appearance descriptor of the predecessor detection.
            * Bounding box coordinates of the successor detection in format
              (top-left-x, top-left-y, width, height).
            * Appearance descriptor of the successor detection.

            When the model was created with does_use_tsn=True, each entry
            must instead contain seven items: the TSN feature of a detection
            follows directly after its appearance descriptor.
        negative_pairs : List[Tuple[int, ndarray, ndarray, ndarray, ndarray]]
            A list of pairs that correspond to two detections of different
            object identities in the same format as positive_pairs.

        Raises
        ------
        ValueError
            If both positive_pairs and negative_pairs are empty.

        """
        # Compute features; positive pairs get label 1, negative pairs 0.
        train_x, train_y = [], []
        for label, pairs in ((1, positive_pairs), (0, negative_pairs)):
            for pair in pairs:
                train_x.append(self._pair_features(pair))
                train_y.append(label)

        if not train_x:
            # Without this guard the 2-D indexing below fails with an
            # unhelpful IndexError on an empty 1-D array.
            raise ValueError(
                "Cannot train transition cost model without training pairs.")

        # Shuffle data and train classifier.
        indices = np.random.permutation(len(train_x))
        train_x = np.asarray(train_x)[indices, :]
        train_y = np.asarray(train_y)[indices]
        self._classifier.fit(train_x, train_y)

    def compute_cost(self, time_gap, boxes1, features1, tsn_features1,
                     boxes2, features2, tsn_features2):
        """Compute transition cost from given features.

        Parameters
        ----------
        time_gap : int
            It is assumed that all detections in boxes1 have been obtained at
            the same time step. Likewise, all detections in boxes2 have to be
            obtained at the same time step. The time_gap is the number of time
            steps in between the two times (successor time index minus
            predecessor time index).
        boxes1 : ndarray
            The first Nx4 dimensional array of bounding box coordinates in
            format (top-left-x, top-left-y, width, height).
        features1 : ndarray
            The first NxL dimensional array of N appearance features of
            length L.
        tsn_features1 : ndarray
            The first array of N action features, one row per entry in
            boxes1. Only used when the model was created with
            does_use_tsn=True.
        boxes2 : ndarray
            The second Mx4 dimensional array of bounding box coordinates in
            format (top-left-x, top-left-y, width, height).
        features2 : ndarray
            The second MxL dimensional array of M appearance features of
            length L.
        tsn_features2 : ndarray
            The second array of M action features, one row per entry in
            boxes2. Only used when the model was created with
            does_use_tsn=True.

        Returns
        -------
        ndarray
            Returns the NxM dimensional matrix of element-wise transition
            costs where element (i, j) contains the transition cost between
            boxes1[i] and boxes2[j]. The cost is the negative log-probability
            the classifier assigns to the positive (label 1) class.

        """
        if self._does_use_tsn:
            features = compute_pairwise_transition_features_with_tsn(
                time_gap, boxes1, features1, tsn_features1,
                boxes2, features2, tsn_features2)
        else:
            features = compute_pairwise_transition_features(
                time_gap, boxes1, features1, boxes2, features2)

        # One classifier row per (boxes1[i], boxes2[j]) combination.
        log_probabilities = self._classifier.predict_log_proba(
            features.reshape(len(boxes1) * len(boxes2), features.shape[-1]))
        return -log_probabilities[:, 1].reshape(len(boxes1), len(boxes2))
# Oversample Oversampling1000 = train.loc[train.money == 1000] Oversampling1500 = train.loc[train.money == 1500] Oversampling2000 = train.loc[train.money == 2000] for i in range(5): train = train.append(Oversampling1000) for j in range(8): train = train.append(Oversampling1500) for k in range(10): train = train.append(Oversampling2000) # model clf = GradientBoostingClassifier(n_estimators=300, random_state=2016) # clf = RandomForestClassifier(n_estimators=500,random_state=2016) clf = clf.fit(train[predictors], train[target]) result = clf.predict(test[predictors]) result_p = clf.predict_log_proba(test[predictors]) # result_pre=clf.staged_predict_proba(test[predictors]) print 'feature_important:', clf.feature_importances_ # Save results test_result = pd.DataFrame(columns=["id", 'subsidy']) test_result.id = ids test_result.subsidy = result test_result.subsidy = test_result.subsidy.apply(lambda x: int(x)) # test_result.pre=result_pre result_compare = pd.merge(test_pre, test_result, on='id') from sklearn.metrics import f1_score print result_compare.money.values print result_compare.subsidy.values print f1_score(result_compare.money.values,