def UncertaintyEstimatesFromClassifiers():
    """Demonstrate decision_function on a binary GradientBoostingClassifier.

    Fits the model on the noisy two-class circles data (labels renamed to
    string classes) and prints the decision-function values, a thresholded
    version of them, and the final predictions.
    """
    import numpy as np
    from sklearn.datasets import make_circles
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
    # Rename the classes "blue" and "red" for illustration purposes.
    y_named = np.array(['blue', 'red'])[y]
    # train_test_split accepts arbitrarily many arrays and splits them all
    # consistently.
    (X_train, X_test,
     y_train_named, y_test_named,
     y_train, y_test) = train_test_split(X, y_named, y, random_state=0)

    # Build the gradient boosting model on the string labels.
    gbrt = GradientBoostingClassifier(random_state=0)
    gbrt.fit(X_train, y_train_named)

    print('X_test.shape: {}'.format(X_test.shape))
    print('Decision function shape: {}'.format(
        gbrt.decision_function(X_test).shape))
    # Show the first few entries of decision_function.
    print('Decision function: \n{}'.format(gbrt.decision_function(X_test[:6])))
    print('Threshold decision function:\n{}'.format(
        gbrt.decision_function(X_test) > 0))
    print('Predictions:\n{}'.format(gbrt.predict(X_test)))
def uncertainty_multiclass_clf():
    """Compare decision_function, predict_proba and predict() on iris (3 classes)."""
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=42)
    gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gbrt.fit(X_train, y_train)

    scores = gbrt.decision_function(X_test)
    print("Decision function shape: {}".format(scores.shape))  # (38, 3)
    print("Decision function:\n{}".format(scores[:6, :]))
    # Recover predictions from these scores: the max entry per row wins.
    print('Argmax of decision function:\n{0}'.format(np.argmax(scores, axis=1)))
    print('Predictions:\n{0}'.format(gbrt.predict(X_test)))

    # Probabilities: one column per class, each row sums to 1.
    proba = gbrt.predict_proba(X_test)
    print("Predicted probabilities:\n{}".format(proba[:6]))
    print("Sums: {}".format(proba[:6].sum(axis=1)))  # Sums: [ 1. 1. 1. ... 1.]
    print("Argmax of predicted probabilities:\n{}".format(
        np.argmax(proba, axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
def salary_predictions():
    """Predict the probability of a management salary for unlabeled graph nodes.

    Builds per-node structural features from the module-level graph ``G``
    (degree/clustering/closeness/betweenness centrality plus two node
    attributes), trains a GradientBoostingClassifier on the nodes whose
    "ManagementSalary" attribute is reported, and returns a pandas Series of
    P(management salary) indexed like the unlabeled nodes.

    NOTE(review): assigning ``.values()`` of the centrality dicts straight
    into DataFrame columns relies on dict iteration order matching
    ``G.nodes`` order — holds on Python 3.7+/NetworkX 2.x insertion order,
    but verify for this environment.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, recall_score, auc, roc_curve, precision_score
    from sklearn.ensemble import GradientBoostingClassifier
    df = pd.DataFrame()
    # Structural features, one row per node of G.
    df["degree_cent"] = nx.degree_centrality(G).values()
    df["clustering"] = nx.clustering(G)
    df["closeness"] = nx.closeness_centrality(G, normalized=True).values()
    # k=200 node samples approximate betweenness (cheaper on large graphs).
    df["betweenness"] = nx.betweenness_centrality(G, normalized=True, endpoints=False, k=200).values()
    # Node attributes: department id and the (possibly missing) label.
    dep = [x[1]["Department"] for x in G.nodes(data=True)]
    man_salary = [x[1]["ManagementSalary"] for x in G.nodes(data=True)]
    df["department"] = dep
    df["management salary"] = man_salary
    # Separate the data with management salary reported from the rows where no salary is reported
    salary_reported = df.dropna()
    salary_not_reported = df[df["management salary"].isnull()]
    x = salary_reported.drop("management salary", axis=1)
    y = salary_reported["management salary"]
    test_df = salary_not_reported.drop("management salary", axis=1)
    # Training the gradient boosting model
    X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=0)
    gbm = GradientBoostingClassifier(random_state=0, learning_rate=0.1, n_estimators=45, max_depth=5).fit(X_train, y_train)
    # Held-out evaluation scores (roc_auc is computed below but not returned).
    y_score_eval = gbm.decision_function(X_test)
    y_proba_eval = gbm.predict_proba(X_test)
    # Scores for the unlabeled nodes we actually need predictions for.
    y_score = gbm.decision_function(test_df)
    y_proba = gbm.predict_proba(test_df)
    fpr, tpr, _ = roc_curve(y_test, y_score_eval)
    roc_auc = auc(fpr, tpr)
    # Column 1 of predict_proba = P(class 1) = probability of a management
    # salary; re-index by the original node ids.
    prob_management_salary = pd.Series(y_proba[:, 1])
    prob_management_salary.index = test_df.index
    return prob_management_salary
def classConfidence():
    """Print multiclass decision-function values and derived predictions on iris."""
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=42)
    model = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    model.fit(X_train, y_train)

    # One decision score per (sample, class); highest score wins.
    scores = model.decision_function(X_test)
    print("Decision Func shape:{}".format(scores.shape))
    print("Decision Func:{}".format(scores[:6, :]))
    print("Argmax of decision func:\n{}".format(np.argmax(scores, axis=1)))
    print("Predictions:\n{}".format(model.predict(X_test)))
def test_sum_match_gradient_boosting_classifier():
    """SHAP values (and interaction values) must sum to the raw model output."""
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    import sklearn

    X_train, X_test, Y_train, Y_test = train_test_split(
        *shap.datasets.adult(), test_size=0.2, random_state=0)
    model = GradientBoostingClassifier(random_state=202, n_estimators=10,
                                       max_depth=10)
    model.fit(X_train, Y_train)

    # decision_function = raw prediction before the probability mapping.
    raw_output = model.decision_function(X_test)

    # Check SHAP additivity against the raw output.
    explainer = shap.TreeExplainer(model)
    initial_expected = explainer.expected_value
    values = explainer.shap_values(X_test)
    assert np.abs(values.sum(1) + explainer.expected_value - raw_output).max() < 1e-6, \
        "SHAP values don't sum to model output!"

    # expected_value must not drift after computing SHAP values.
    assert np.abs(initial_expected - explainer.expected_value) < 1e-6, "Inital expected value is wrong!"

    # Interaction values on a small slice must also sum to the raw output.
    interactions = explainer.shap_interaction_values(X_test.iloc[:10, :])
    assert np.abs(interactions.sum(1).sum(1) + explainer.expected_value - raw_output[:10]).max() < 1e-6, \
        "SHAP interaction values don't sum to model output!"
def blight_model():
    """Train a GradientBoostingClassifier on blight-ticket data; score test.csv.

    Features: fine_amount, judgment_amount, and 'gap' = days between the
    ticket issue date and the hearing date (missing dates imputed to
    1900-01-01). Returns decision-function scores for test.csv as a Series
    indexed by ticket_id.

    NOTE(review): the column assignments on the ``train_blight`` slice are
    chained assignment and can trigger pandas SettingWithCopyWarning —
    confirm a ``.copy()`` is not needed here.
    """
    from sklearn.model_selection import train_test_split
    # Only the columns actually used for features/labels are loaded.
    train=pd.read_csv('train.csv',encoding = 'ISO-8859-1',usecols=['ticket_id','compliance','fine_amount','judgment_amount','hearing_date','ticket_issued_date'])
    # Keep only rows with a definite 0/1 compliance label (drops NaN rows).
    train_blight=train[(train['compliance']==1) | (train['compliance']==0)]
    # Impute missing dates with a sentinel, then reduce to date objects.
    train_blight['hearing_date']=pd.to_datetime(train_blight['hearing_date'].fillna('1900-01-01 00:00:00')).dt.date
    train_blight['ticket_issued_date']=pd.to_datetime(train_blight['ticket_issued_date'].fillna('1900-01-01 00:00:00')).dt.date
    # 'gap' = hearing date minus issue date, in whole days.
    train_blight['gap']=train_blight['hearing_date']-train_blight['ticket_issued_date']
    train_blight['gap']=train_blight['gap'].fillna(pd.Timedelta('-1 days')).dt.days
    # Same preprocessing for the unlabeled test set.
    test_blight=pd.read_csv('test.csv',encoding = 'ISO-8859-1',usecols=['ticket_id','fine_amount','judgment_amount','hearing_date','ticket_issued_date'])
    test_blight['hearing_date']=pd.to_datetime(test_blight['hearing_date'].fillna('1900-01-01 00:00:00')).dt.date
    test_blight['ticket_issued_date']=pd.to_datetime(test_blight['ticket_issued_date'].fillna('1900-01-01 00:00:00')).dt.date
    test_blight['gap']=test_blight['hearing_date']-test_blight['ticket_issued_date']
    test_blight['gap']=test_blight['gap'].fillna(pd.Timedelta('-1 days')).dt.days
    feature_names2=['fine_amount','judgment_amount','gap']
    X=train_blight[feature_names2]
    y=train_blight['compliance']
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
    from sklearn.ensemble import GradientBoostingClassifier
    clGrad=GradientBoostingClassifier().fit(X_train,y_train)
    # Scores computed but not printed/returned — presumably for notebook use.
    clGrad.score(X_train,y_train)
    clGrad.score(X_test,y_test)
    # Your code here
    # Decision-function score per test ticket, keyed by ticket_id.
    return pd.Series(clGrad.decision_function(test_blight[feature_names2]),index=test_blight['ticket_id'])
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Check the h2o4gpu GBM wrapper against sklearn when backend='sklearn'.

    Fits both models on the creditcard data and, for the sklearn backend,
    asserts that predictions, probabilities, scores, decision function,
    staged predictions, leaf indices and fitted attributes agree.

    Fix: ``score()`` returns a scalar accuracy, so the original
    ``(gbm.score(...) == gbm_sk.score(...)).all()`` called ``.all()`` on a
    plain bool and raised AttributeError; compare the scalars directly.
    The redundant ``== True`` suffixes on the array asserts are dropped.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu

    # Run h2o4gpu version of gradient boosting.
    Solver = h2o4gpu.GradientBoostingClassifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the reference sklearn version.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
        # score() is a scalar float — direct comparison, no .all().
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]).all()
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
def d():
    """Binary circles demo: decision_function vs predict_proba vs predict().

    Bug fixed: the original unpacked train_test_split into
    ``X_train, X_test, y_train, y_test, y_train, y_test`` — the duplicated
    targets silently overwrote the string labels with the integer ones, so
    the model was fit on the wrong label array. Use distinct ``*_named``
    names and fit on the string labels as intended; the numeric
    ``y_train``/``y_test`` are still available for plotting.
    """
    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
    # we rename the classes "blue" and "red" for illustration purposes
    y_named = np.array(["blue", "red"])[y]
    # train_test_split splits arbitrarily many arrays consistently
    X_train, X_test, y_train_named, y_test_named, y_train, y_test = \
        train_test_split(X, y_named, y, random_state=0)
    # build the gradient boosting model on the named (string) labels
    gbrt = GradientBoostingClassifier(random_state=0)
    gbrt.fit(X_train, y_train_named)
    # 1 where the decision function is positive, else 0
    greater_zero = (gbrt.decision_function(X_test) > 0).astype(int)
    print("分类类型 :\n{}".format(gbrt.classes_))
    print("Shape of probabilities: \n{}".format(gbrt.predict_proba(X_test)))
    print("decision function: \n{}".format(gbrt.decision_function(X_test)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
    # decision_function returns one float per sample (signed distance from
    # the boundary); predict_proba returns one probability per class with
    # rows summing to 1 and is often easier to interpret.
    plot_probla(X, X_test, X_train, gbrt, y_test, y_train)
def learn(trainX1, cvX1, trainy1, cvy1):
    """Fit a GradientBoostingClassifier and return its AUC-ROC on the CV split."""
    # Normalize features for both splits before fitting.
    train_features = normalize(trainX1)
    cv_features = normalize(cvX1)
    model = GradientBoostingClassifier().fit(train_features, trainy1)
    # Score via the decision function, then integrate the ROC curve.
    cv_scores = model.decision_function(cv_features)
    fpr, tpr, _ = roc_curve(cvy1, cv_scores)
    return auc(fpr, tpr)
def gradient_booster():
    """Multiclass uncertainty demo on module-level X_train/X_test fixtures.

    Prints the decision-function values, the predicted probabilities, their
    argmaxes, and predict() output so the three can be compared by eye.
    Fix: corrected typos in the printed labels ("Decisioon" -> "Decision")
    and in the comments.
    """
    gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gbrt.fit(X_train, y_train)
    print("Decision function shape: {}".format(
        gbrt.decision_function(X_test).shape))
    # plot the first few entries of the decision function
    print("Decision function:\n{}".format(
        gbrt.decision_function(X_test)[:6, :]))
    print("Argmax of decision functions:\n{}".format(
        np.argmax(gbrt.decision_function(X_test), axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
    # show first few entries of predict_proba
    print("Predicted probabilities:\n{}".format(
        gbrt.predict_proba(X_test)[:6]))
    # show that sums across rows are one
    print("Sums: {}".format(gbrt.predict_proba(X_test)[:6].sum(axis=1)))
    print("Argmax of predicted probabilities:\n{}".format(
        np.argmax(gbrt.predict_proba(X_test), axis=1)))
    print("Predictions:\n{}".format(gbrt.predict(X_test)))
def test_max_feature_regression():
    """Regression test: random_state must be threaded through max_features.

    Fix: replaced the deprecated nose-style ``assert_true`` helper with a
    plain ``assert`` (consistent with the other copy of this test in the
    file; ``assert_true`` was removed from sklearn's test utilities).
    """
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                      max_depth=2, learning_rate=.1,
                                      max_features=2, random_state=1)
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
def test_max_feature_regression():
    """Regression test: random_state must be set properly with max_features."""
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    # First 2000 rows train, the rest evaluate.
    X_train, y_train = X[:2000], y[:2000]
    X_test, y_test = X[2000:], y[2000:]

    clf = GradientBoostingClassifier(n_estimators=100,
                                     min_samples_split=5,
                                     max_depth=2,
                                     learning_rate=.1,
                                     max_features=2,
                                     random_state=1)
    clf.fit(X_train, y_train)
    # Deviance of the raw decision scores on the held-out half.
    deviance = clf.loss_(y_test, clf.decision_function(X_test))
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Verify the h2o4gpu GBM wrapper matches sklearn for backend='sklearn'.

    Fix: ``score()`` returns a scalar accuracy, so the original
    ``(gbm.score(...) == gbm_sk.score(...)).all()`` invoked ``.all()`` on a
    plain bool and raised AttributeError; the scalars are now compared
    directly. Redundant ``== True`` comparisons are removed.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu

    # h2o4gpu gradient boosting model.
    Solver = h2o4gpu.GradientBoostingClassifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Reference sklearn model.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
        # Scalar accuracies: compare directly instead of calling .all().
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]).all()
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
def test_single_row_gradient_boosting_classifier():
    """SHAP values for a single row must sum to that row's decision value."""
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import GradientBoostingClassifier
    import sklearn

    X_train, X_test, Y_train, _ = train_test_split(
        *shap.datasets.adult(), test_size=0.2, random_state=0)
    model = GradientBoostingClassifier(random_state=202, n_estimators=10,
                                       max_depth=10)
    model.fit(X_train, Y_train)

    # Raw (pre-probability) outputs for the whole test split.
    raw_output = model.decision_function(X_test)
    explainer = shap.TreeExplainer(model)
    # Explain just the first test row.
    row_values = explainer.shap_values(X_test.iloc[0, :])
    assert np.abs(row_values.sum() + explainer.expected_value - raw_output[0]) < 1e-4, \
        "SHAP values don't sum to model output!"
def in_101():
    """Peek at decision_function / predict_proba for iris (lr=0.1)."""
    from sklearn.datasets import load_iris
    from sklearn.ensemble import GradientBoostingClassifier

    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=42)
    model = GradientBoostingClassifier(learning_rate=0.1, random_state=0)
    model.fit(x_train, y_train)
    # First five test rows: raw scores, then class probabilities.
    print(model.decision_function(x_test[0:5]))
    print(model.predict_proba(x_test[0:5]))
    print(y_train[0])
    # Map the first five training labels to their species names.
    for label in y_train[0:5]:
        print(iris.target_names[label])
def boosting(X_train, y_train, X_test, y_test):
    """Train a GradientBoostingClassifier; return (accuracy, precision at the
    first PR threshold, confusion matrix) on the test split."""
    seed = 7
    num_trees = 100
    # NOTE(review): kfold is built but never used below — looks like leftover.
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    model = GradientBoostingClassifier(
        n_estimators=num_trees, random_state=seed
    ).fit(X_train, y_train)
    # or: model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    accuracy = model.score(X_test, y_test)
    decision_scores = model.decision_function(X_test)
    predictions = model.predict(X_test)
    precisions, recalls, thresholds = precision_recall_curve(
        y_test, decision_scores, pos_label=1)
    print(precisions[:10], recalls[:10], thresholds[:10])
    first_precision = precisions[0]
    conf = confusion_matrix(y_test, predictions)
    return accuracy, first_precision, conf
def gdb_machine(df_list, label_list, penalty=1, scale=False):
    """Fit a sample-weighted GradientBoostingClassifier per (df, labels) pair.

    Parameters
    ----------
    df_list : list of feature matrices (one model fitted per entry)
    label_list : list of matching binary label vectors
    penalty : sample weight applied to falsy-label rows (truthy rows get 1)
    scale : if True, run each df through ``scale_df`` first

    Returns
    -------
    dict of lists keyed 'prediction', 'probaility' (sic — misspelled key is
    kept for backward compatibility with existing callers), 'y_test',
    'y_score'.

    Fixes: bare ``except:`` clauses replaced with ``except Exception`` and
    exception chaining (``raise ... from exc``) so the original traceback is
    not discarded; the lambda bound to a name replaced by a comprehension.
    """
    random_state = np.random.RandomState(20180213)
    gdb_results = {
        'prediction': [],
        'probaility': [],  # NOTE: misspelled key preserved — public interface
        'y_test': [],
        'y_score': []
    }
    if scale:
        try:
            df_list = [scale_df(df) for df in df_list]
            print('DF Scaling successful.')
        except Exception as exc:
            # Chain the original error instead of swallowing it.
            raise ValueError('Failed to execute DF Scaling.') from exc
    for x, y in zip(df_list, label_list):
        try:
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=.2, random_state=random_state)
        except Exception as exc:
            raise ValueError('Train/Test split failed.') from exc
        gdb = GradientBoostingClassifier(n_estimators=7, max_depth=6,
                                         min_samples_split=1780,
                                         min_samples_leaf=1,
                                         random_state=20180320,
                                         max_features=310, subsample=0.8,
                                         learning_rate=0.11)
        # Up-weight falsy-label samples by `penalty`; truthy samples keep 1.
        sample_weights = [1 if label else penalty for label in y_train]
        gdb.fit(x_train, y_train, sample_weight=sample_weights)
        gdb_results['y_test'].append(y_test)
        gdb_results['prediction'].append(gdb.predict(x_test))
        gdb_results['probaility'].append(gdb.predict_proba(x_test)[::, 1])
        gdb_results['y_score'].append(gdb.decision_function(x_test))
    return gdb_results
class GradientBoosting(Processor):
    """Processor that scores feature vectors with a gradient-boosting model.

    NOTE(review): ``GradientBoostingClassifier``'s first positional
    parameter is ``loss`` (a string), so passing ``LinearSVC(C=c)`` here is
    invalid and will fail when the model is used — possibly
    ``init=LinearSVC(C=c)`` or a different ensemble class was intended.
    Confirm before relying on ``fit``.
    """

    def __init__(self, name='rfe', c=1.0, n_estimators=100,
                 keys_correspondences=DEFAULT_KEYS_CORRESPONDENCES):
        super(GradientBoosting, self).__init__(name)
        self._model = GradientBoostingClassifier(LinearSVC(C=c),
                                                 n_estimators=n_estimators,
                                                 max_depth=1000)
        # Maps logical field names (labels/features/scores/...) to dict keys.
        self.keys_correspondences = keys_correspondences

    def to_dict(self):
        # Serialize the model as raw pickle bytes wrapped in an array.
        output_dict = {
            'data': np.array(pickle.dumps(self._model)),
        }
        return output_dict

    def from_dict(self, dict):
        # NOTE(review): parameter shadows the builtin `dict` (interface kept).
        self._model = pickle.loads(dict['data'])

    def fit(self, x):
        labels_key = self.keys_correspondences["labels_key"]
        features_key = self.keys_correspondences["features_key"]
        # Binarize the labels: any positive value becomes 1.
        labels = copy.deepcopy(x[labels_key])
        labels[labels > 0] = 1
        self._model.fit(x[features_key], labels)

    def run(self, x):
        features_key = self.keys_correspondences["features_key"]
        scores_key = self.keys_correspondences["scores_key"]
        output_type_key = self.keys_correspondences["output_type_key"]
        # Raw decision scores, flagged downstream as likelihood-type output.
        x[scores_key] = self._model.decision_function(x[features_key])
        x[output_type_key] = ProcessorOutputType.LIKELIHOOD
        return x

    def __str__(self):
        description = {
            'type': 'Gradient boosting processor',
            'name': self.name
        }
        return str(description)
def test_probability_exponential():
    """Predict probabilities.

    Fix: the probability identity check used ``assert_array_equal`` — an
    exact bit-for-bit comparison of floating-point results, which is brittle
    across platforms/BLAS builds. Use ``np.allclose`` (the other copy of
    this test correctly compares approximately).
    """
    clf = GradientBoostingClassifier(loss="exponential", n_estimators=100,
                                     random_state=1)

    # predict_proba before fit must raise.
    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)
    score = clf.decision_function(T).ravel()
    # For exponential loss, P(y=1) = sigmoid(2 * decision_function);
    # compare approximately, not exactly.
    assert np.allclose(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * score)))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
def Uncertainty_eval():
    '''Binary/multiclass uncertainty demo.

    Fixes: corrected the "Decusuib" typos in the printed labels, and the
    second print — which shows predict_proba output — was mislabeled as a
    decision function; it is now labeled "Predicted probabilities".
    '''
    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
    y = np.array(["blue", "red"])[y]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    gb = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
    gb.fit(X_train, y_train)
    print("Decision functions: \n{}".format(gb.decision_function(X_test)[:6]))
    print(gb.classes_)
    print("Predicted probabilities: \n{}".format(gb.predict_proba(X_test)[:6]))
    # Refit the same estimator on the 3-class iris data.
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=42)
    gb.fit(X_train, y_train)
    print(gb.score(X_test, y_test), '\n',
          np.argmax(gb.predict_proba(X_test), axis=1))
def gradboost(n_trees):
    """Plot train/test accuracy of depth-1 gradient boosting on iris as the
    ensemble size grows from 1 to n_trees."""
    iris_dataset = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris_dataset['data'], iris_dataset['target'], random_state=0)
    train_acc = []
    test_acc = []
    # Refit from scratch for each ensemble size 1..n_trees.
    for size in range(1, n_trees + 1):
        model = GradientBoostingClassifier(random_state=0,
                                           n_estimators=size, max_depth=1)
        model.fit(X_train, y_train)
        train_acc.append(model.score(X_train, y_train))
        test_acc.append(model.score(X_test, y_test))
    sizes = list(range(1, n_trees + 1))
    plt.figure()
    plt.plot(sizes, train_acc, 'r', sizes, test_acc, 'k')
    plt.show()
    # Inspect one decision score / probability from the largest model.
    print(model.decision_function(X_test)[0][0])
    print(model.predict_proba(X_test)[0][0])
class GradientBoostingClassifierImpl:
    """Thin delegation wrapper around the operator class ``Op``."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # Forward to the wrapped estimator; y is optional for API symmetry.
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
def test_probability_exponential():
    # Predict probabilities.
    clf = GradientBoostingClassifier(loss='exponential',
                                     n_estimators=100, random_state=1)

    # Calling predict_proba before fit must raise.
    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # All probabilities must lie within [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)

    # For exponential loss, P(y=1) = sigmoid(2 * decision_function).
    raw = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1], 1.0 / (1.0 + np.exp(-2 * raw)))

    # Predictions derived from the probabilities must match predict().
    derived = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(derived, true_result)
def blight_model():
    """Train a GradientBoostingClassifier on blight-ticket data, plot its
    precision-recall and ROC curves, and return P(compliance) for test.csv
    as a Series indexed by ticket_id.

    NOTE(review): ``plt.axes().set_aspect('equal')`` is deprecated in modern
    matplotlib (it creates/replaces an axes); ``plt.gca()`` is the usual fix.
    ``add_df``/``lat_df`` are loaded but never used — presumably leftover.
    """
    # Read dataset
    df = pd.read_csv("train.csv", encoding='ISO-8859-1', low_memory=False)
    # Keep only rows with a finite (0/1) compliance label.
    df = df[np.isfinite(df['compliance'])]
    df2 = pd.read_csv("test.csv")
    add_df = pd.read_csv("addresses.csv")
    lat_df = pd.read_csv("latlons.csv")
    # assign y,X for training and X2 for testing
    y = df['compliance'].values
    X = df[list(['judgment_amount', 'late_fee'])].values
    X2 = df2[list(['judgment_amount', 'late_fee'])].values
    # Split dataset into train and test/dev dataset using an inbuilt function of sklearn
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # training the data using Gradient Booster classifier
    clf = GradientBoostingClassifier().fit(X_train, y_train)
    # Predicting various score on the trained model
    y_score = clf.decision_function(X_test)
    print('Score training Set: ' + str(clf.score(X_train, y_train)))
    print('Score test set: ' + str(clf.score(X_test, y_test)))
    print('ROC_AUC Score:' + str(roc_auc_score(y_test, y_score)) + "\n")
    # Testing is important, but measuring it matters even more — plot the
    # precision-recall curve for the model.
    # The curve shows the model is more precision-oriented than recall-oriented.
    precision, recall, thresholds = precision_recall_curve(y_test, y_score)
    # Mark the point whose threshold is closest to zero.
    closest_zero = np.argmin(np.abs(thresholds))
    closest_zero_p = precision[closest_zero]
    closest_zero_r = recall[closest_zero]
    plt.figure()
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    plt.plot(precision, recall, label='Precision-Recall Curve')
    plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12,
             fillstyle='none', c='r', mew=3)
    plt.xlabel('Precision', fontsize=16)
    plt.ylabel('Recall', fontsize=16)
    plt.axes().set_aspect('equal')
    plt.show()
    # plotting roc_auc graph
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score)
    roc_auc_lr = auc(fpr_lr, tpr_lr)
    plt.figure()
    plt.xlim([-0.01, 1.00])
    plt.ylim([-0.01, 1.01])
    plt.plot(fpr_lr, tpr_lr, lw=3,
             label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.title('ROC curve ', fontsize=16)
    plt.legend(loc='lower right', fontsize=13)
    # Diagonal chance line.
    plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    plt.axes().set_aspect('equal')
    plt.show()
    ind = df2['ticket_id'].values
    #print(ind)
    # P(class 1) = probability of compliance for each test ticket.
    pred2 = clf.predict_proba(X2)[:, 1]
    #print(pred)
    #print(np.shape(pred))
    #print(np.shape(ind))
    ans = pd.Series(pred2, ind, dtype='float64')
    return ans
def decision_function():
    """Binary-class decision_function demo on the circles dataset.

    Fits a GradientBoostingClassifier on the noisy two-circle data (string
    labels), prints the decision-function output and its thresholded form,
    verifies the thresholded form matches predict(), and plots the decision
    boundary next to the decision-function heat map (mglearn figure 2-55).
    """
    # The circles dataset consists of one large and one small circle.
    # Prepare a noisy circles dataset.
    from sklearn.datasets import make_circles
    from sklearn.model_selection import train_test_split
    X, y = make_circles(noise=0.25, factor=0.5, random_state=1)
    y_named = np.array(['blue', 'red'])[y]
    X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(
        X, y_named, y, random_state=seed)
    # Build the gradient boosting model on the named labels.
    from sklearn.ensemble import GradientBoostingClassifier
    gbdt = GradientBoostingClassifier(random_state=seed)
    gbdt.fit(X_train, y_train_named)
    # 2.4.1. The decision function.
    print('测试集的形状:{}'.format(X_test.shape))
    decision_function_values = gbdt.decision_function(X_test)
    # For binary classification the decision function is one-dimensional.
    # TODO: what historical reason caused this?
    print('决策函数的形状:{}'.format(decision_function_values.shape))
    print('决策函数计算测试集的输出值:\n{}'.format(decision_function_values))
    print('决策函数计算测试集的输出值经过阈值判断的结果:\n{}'.format(decision_function_values > 0))
    print('模型预测测试集的输出结果:\n{}'.format(gbdt.predict(X_test)))
    # Convert the booleans to 0/1.
    greater_zero = (decision_function_values > 0).astype(int)
    # Convert 0/1 to class names via classes_.
    pred = gbdt.classes_[greater_zero]
    print('决策函数输出结果与模型计算测试集的输出结果是否相等: {}'.format(
        np.all(pred == gbdt.predict(X_test))))
    # (No-op self-assignment kept as-is.)
    decision_function_values = decision_function_values
    print('-' * 20)
    print("决策函数的输出很难解释。")
    print('决策函数的最小值: {:.2f} 与最大值: {:.2f}'.format(
        np.min(decision_function_values), np.max(decision_function_values)))
    # Left panel: decision boundary; right panel: decision-function scores.
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))
    mglearn.tools.plot_2d_separator(gbdt, X, ax=axes[0], alpha=.4,
                                    fill=True, cm=mglearn.cm2)
    scores_image = mglearn.tools.plot_2d_scores(gbdt, X, ax=axes[1],
                                                alpha=.4, cm=mglearn.ReBl)
    from mglearn import discrete_scatter
    for ax in axes:
        # Test points as triangles, training points as circles.
        discrete_scatter(X_test[:, 0], X_test[:, 1], y_test,
                         markers=['^'], ax=ax)
        discrete_scatter(X_train[:, 0], X_train[:, 1], y_train,
                         markers=['o'], ax=ax)
        ax.set_xlabel('Feature 0')
        ax.set_ylabel('Feature 1')
    plt.colorbar(scores_image, ax=axes.tolist())
    axes[0].legend(
        ['Test class 0', 'Test class 1', 'Train Class 0', 'Train Class 1'],
        ncol=4, loc=(.1, 1.1))
    plt.title("图2-55 梯度提升模型在一个二维圆数据集上的决策边界(左)和决策函数(右)")
def multi_classes():
    """Three-class uncertainty demo on iris with GBDT and LogisticRegression.

    Shows that the argmax over decision_function / predict_proba reproduces
    predict(), and how ``classes_`` maps argmax indices back to labels.

    Bug fixed: the final LogisticRegression check indexed ``classes_`` with
    ``argmax_decision_func`` (the argmax computed for the GBDT model)
    instead of ``argmax_dec_func`` (the one computed for the
    LogisticRegression model).
    """
    # iris is a three-class dataset.
    from sklearn.datasets import load_iris
    iris = load_iris()
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=seed)
    from sklearn.ensemble import GradientBoostingClassifier
    gbdt = GradientBoostingClassifier(learning_rate=0.01, random_state=seed)
    gbdt.fit(X_train, y_train)
    print('=' * 20)
    print("使用 GBDT 对 iris 数据集进行学习")
    # One column of "certainty" per class — higher score, more likely.
    decision_func_values = gbdt.decision_function(X_test)
    print('-' * 20)
    print('决策函数输出值的形状:{}'.format(decision_func_values.shape))
    print('决策函数的前六个输出值:\n{}'.format(decision_func_values[:6]))
    argmax_decision_func = decision_func_values.argmax(axis=1)
    print('-' * 20)
    print('决策函数的输出值中的最大项:\n{}'.format(argmax_decision_func))
    predict_prob = gbdt.predict_proba(X_test)
    print('-' * 20)
    print('预测概率输出值的形状:{}'.format(predict_prob.shape))
    print('预测概率的前6个输出值:\n{}'.format(predict_prob[:6]))
    argmax_predict_prob = predict_prob.argmax(axis=1)
    print('-' * 20)
    print('预测概率的输出值中的最大项:\n{}'.format(argmax_predict_prob))
    predict_result = gbdt.predict(X_test)
    print('-' * 20)
    print('测试数据集的预测结果:\n{}'.format(predict_result))
    print('-' * 20)
    print("决策函数的输出值中的最大项与测试数据集的预测结果是否相等?",
          np.all(argmax_decision_func == predict_result))
    print("预测概率的输出值中的最大项与测试数据集的预测结果是否相等?",
          np.all(argmax_predict_prob == predict_result))

    from sklearn.linear_model import LogisticRegression
    log_reg = LogisticRegression(solver='lbfgs', multi_class='auto',
                                 max_iter=10000)
    # Fit on string class names so classes_ holds names, not integers.
    named_target = iris.target_names[y_train]
    log_reg.fit(X_train, named_target)
    print('=' * 20)
    print("使用 LogisticRegression 对 iris 数据集进行学习")
    print('-' * 20)
    print('训练数据集中的类别:{}'.format(log_reg.classes_))
    decision_func_values = log_reg.decision_function(X_test)
    print('-' * 20)
    print('决策函数输出值的形状:{}'.format(decision_func_values.shape))
    print('决策函数的前六个输出值:')
    print(decision_func_values[:6])
    argmax_dec_func = decision_func_values.argmax(axis=1)
    print('-' * 20)
    print('决策函数的输出值中的前十个最大项:')
    print(argmax_dec_func[:10])
    print('利用分类器的classes_属性转换决策函数的输出值中的前十个最大项:')
    print(log_reg.classes_[argmax_dec_func][:10])
    predict_prob = log_reg.predict_proba(X_test)
    print('-' * 20)
    print('预测概率输出值的形状:{}'.format(predict_prob.shape))
    argmax_predict_prob = predict_prob.argmax(axis=1)
    print('-' * 20)
    print('预测概率的输出值中的前十个最大项:')
    print(argmax_predict_prob[:10])
    print('利用分类器的classes_属性转换预测概率的输出值中的前十个最大项:')
    print(log_reg.classes_[argmax_predict_prob][:10])
    predict_result = log_reg.predict(X_test)
    print('-' * 20)
    print('测试数据集的前十个预测结果:')
    print(predict_result[:10])
    print('-' * 20)
    # Fixed: use the LogisticRegression argmax (argmax_dec_func), not the
    # GBDT one (argmax_decision_func) computed earlier.
    print("决策函数的输出值中的最大项与测试数据集的预测结果是否相等?",
          np.all(log_reg.classes_[argmax_dec_func] == predict_result))
    print("预测概率的输出值中的最大项与测试数据集的预测结果是否相等?",
          np.all(log_reg.classes_[argmax_predict_prob] == predict_result))
# NOTE(review): script fragment — `gb`, `X_train_sub`, `y_train_sub`,
# `X_validation_sub` and `y_validation_sub` are defined earlier in the
# original script (not visible here). It first reports the accuracy of a
# previously fitted model, then refits a fresh one and evaluates it.
print("Accuracy score (training): {0:.3f}".format(
    gb.score(X_train_sub, y_train_sub)))
print("Accuracy score (validation): {0:.3f}".format(
    gb.score(X_validation_sub, y_validation_sub)))
print()

# Output confusion matrix and classification report of Gradient Boosting algorithm on validation set
gb = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5,
                                max_features=2, max_depth=2, random_state=0)
gb.fit(X_train_sub, y_train_sub)
predictions = gb.predict(X_validation_sub)

# ROC curve and Area-Under-Curve (AUC)
y_scores_gb = gb.decision_function(X_validation_sub)
fpr_gb, tpr_gb, _ = roc_curve(y_validation_sub, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)
print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))
print("Confusion Matrix:")
print(confusion_matrix(y_validation_sub, predictions))
print()
print("Classification Report")
print(classification_report(y_validation_sub, predictions))

# Entry-point guard intentionally does nothing when run as a script.
if __name__ == '__main__':
    pass
def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):
    """Two-level stacking: grid-search an upper tree ensemble (GBDT/RF),
    dump its leaf-encoded features, then grid-search a lower linear
    classifier on those features and write its meta-features to CSV.

    upper model is GBDT or Random Forest
    lower model is Linear Classifier

    Args:
        stack_setting_: parsed settings JSON (nested dict of folders/filenames).
        upper_param_keys/vals: grid-search space for the upper model.
        lower_param_keys/vals: grid-search space for the lower model.
        num_proc: worker count for GridSearchCV (default 6).

    Returns:
        (upper_best_params, lower_best_param) tuple.

    NOTE(review): Python 2 code; depends on module-level names not visible
    here (sys, os, np, pd, plt, sns, gzip, cPickle, GridSpec, Config,
    ExperimentL1, ExperimentL1_1, TreeTransform, GridSearch, SklearnModel,
    sklearn estimators).
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()

    if num_proc is None:
        num_proc = 6

    upper_best_params = None
    lower_best_param = None

    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state',
                            'subsample', 'max_features', 'max_leaf_nodes',
                            'learning_rate', 'max_depth', 'min_samples_leaf']
    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0],
                            [0.1], [5], [20], [0.1], [2], [8]]

    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 has model free. On the other hand, data is fix
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'],
                       test_fname = stack_setting_['0-Level']['test'])

    # Absolute paths for the leaf-feature dumps produced by the upper model.
    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_train_fname = os.path.join(Config.get_string('data.path'),
                                     model_folder,
                                     model_train_fname)
    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    model_test_fname = os.path.join(Config.get_string('data.path'),
                                    model_folder,
                                    model_test_fname)
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))
    # Only retrain the upper model if its feature dumps do not exist yet.
    if os.path.isfile(model_train_fname) is False and \
            os.path.isfile(model_test_fname) is False:
        #upper_param_dict['model_type'] == [GradientBoostingClassifier]
        del upper_param_dict['model_type']
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict,
                              verbose = 10,
                              scoring = stack_setting_['1-Level']['gbdt_linear']['upper']['metrics'],  # scoring = "precision" or "recall" or "f1"
                              n_jobs = num_proc, cv = 5)

        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print upper_best_params
        del clf_cv

        # Refit one estimator with the best parameters for the diagnostics.
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        # Per-iteration test loss from staged predictions.
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder']
        graph_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name']
        graph_fname = os.path.join(Config.get_string('data.path'),
                                   graph_folder,
                                   graph_fname)
        # Three-panel figure: CDF of scores (ax1), loss curves (ax2),
        # feature importances (ax3).
        gs = GridSpec(2,2)
        ax1 = plt.subplot(gs[0,1])
        ax2 = plt.subplot(gs[1:,1])
        ax3 = plt.subplot(gs[:,0])

        #ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        #ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        #ax1.set_xlabel('the number of weak learner')
        #ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        #ax1.legend(loc="best")

        # Empirical CDF of the decision-function confidence scores.
        confidence_score = clf.decision_function(X_test)
        #sns.distplot(confidence_score, kde=False, rug=False, ax=ax1)
        num_bins = 100
        # NOTE(review): bare except silently falls back to default binning;
        # `normed=` is the pre-1.21 numpy spelling of `density=`.
        try:
            counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
        except:
            counts, bin_edges = np.histogram(confidence_score, normed=True)
        cdf = np.cumsum(counts)
        ax1.plot(bin_edges[1:], cdf / cdf.max())
        ax1.set_ylabel('CDF')
        ax1.set_xlabel('Decision_Function:Confidence_Score', fontsize=10)

        # dump for the transformated feature
        # Re-wrap the best GBDT in a TreeTransform to get leaf-index
        # one-hot encodings of the data.
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        if type(X_train) == pd.core.frame.DataFrame:
            clf.fit(X_train.as_matrix().astype(np.float32), y_train)
        elif X_train == np.ndarray:
            # NOTE(review): this condition looks wrong — it compares the
            # array itself to the ndarray *type*; probably meant
            # `type(X_train) == np.ndarray` like the checks below.
            clf.fit(X_train.astype(np.float32), y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)
        if type(X_train) == pd.core.frame.DataFrame:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        elif type(X_train) == np.ndarray:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensambles
        # Keep only the most important features when there are many
        # (top quantile above `score_threshold`).
        score_threshold=0.8
        index2feature = dict(zip(np.arange(len(X_train.columns.values)),
                                 X_train.columns.values))
        feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]]
        feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index]
        fis = pd.DataFrame(
            {'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
             'score':feature_importances_score}
            )
        if len(fis.index) > 20:
            score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
            # where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
            where_str = 'score >= %f' % (score_threshold)
            fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        #print clf.toarray().shape
        # >(26049, 100)
        #input_features = 26049, weak_learners = 100
        #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0]
        #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:]

        ## feature transformation : get test data from train trees
        #print transformated_train_features.shape, X_train.shape
        #print transformated_test_features.shape, X_test.shape
        transformated_train_features = clf.one_hot_encoding
        if type(X_test) == pd.core.frame.DataFrame:
            transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32),
                                                        y_test)
        elif type(X_train) == np.ndarray:
            transformated_test_features = clf.transform(X_test, y_test)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        #model_train_fname = os.path.join(Config.get_string('data.path'),
        #                                 model_folder,
        #                                 model_train_fname)
        # Persist gzip-pickled (features, labels) pairs for the lower model.
        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        #model_test_fname = os.path.join(Config.get_string('data.path'),
        #                                model_folder,
        #                                model_test_fname)
        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']
    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64],
                            ['uniform', 'distance'], ['ball_tree'], [30],
                            ['minkowski'], [2], [4]]
    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # ExperimentL1_1 has model free. On the other hand, data is fix
    # NOTE(review): the default lower params above are KNN params with no
    # 'penalty' key, so with defaults none of these branches match and the
    # function exits via sys.exit() below.
    if lower_param_dict['model_type'] == [LogisticRegression] and lower_param_dict['penalty'] == ['l1']:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR-L1'
    elif lower_param_dict['model_type'] == [LogisticRegression] and lower_param_dict['penalty'] == ['l2']:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR-L2'
    elif lower_param_dict['model_type'] == [LinearSVC] and lower_param_dict['penalty'] == ['l1']:
        # SVM L1
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM-L1'
    elif lower_param_dict['model_type'] == [LinearSVC] and lower_param_dict['penalty'] == ['l2']:
        # SVM L1
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM-L2'
    else:
        sys.stderr.write("You should input lower liner model\n")
        sys.exit()

    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                         train_fname = model_train_fname,
                         test_fname = model_test_fname)
    # GridSearch has a single model. model is dertermined by param
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                    cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'],
                    cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'],
                    refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print lower_best_param

    # get meta_feature
    # Insert the model short-name before the file extension / last header column.
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1]
        )
    meta_header_ = "%s_%s,%s" % (
        ",".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'].split(",")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'].split(",")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = meta_header_,
        best_param_ = lower_best_param
        )

    ## best parameter for GBDT and anohter sklearn classifier
    #return best_param, best_score
    # Fall back to the configured best parameters when the upper model was
    # not retrained (feature dumps already existed).
    if upper_best_params is None:
        upper_best_params = stack_setting_['1-Level']['gbdt_linear']['upper']['best_parameter']
    return upper_best_params, lower_best_param
X_train, X_test, y_train, y_test, ind_train, ind_test = load_data(full=False) clf = GradientBoostingClassifier(n_estimators=500, max_depth=6, learning_rate=0.1, max_features=256, min_samples_split=15, verbose=3, random_state=13) print('_' * 80) print('training') print print clf clf.fit(X_train, y_train) if y_test is not None: from sklearn.metrics import auc_score print clf y_scores = clf.decision_function(X_test).ravel() print "AUC: %.6f" % auc_score(y_test, y_scores) if generate_report: from error_analysis import error_report data = np.load("data/train.npz") X = data['X_train'] X_test_raw = X[ind_test] error_report(clf, X_test_raw, y_test, y_scores=y_scores, ind=ind_test) np.savetxt("gbrt3.txt", clf.decision_function(X_test))
class ClassifierModeling:
    """Wrapper that builds, fits, evaluates, and persists one of several
    classifiers selected by a string name.

    NOTE(review): relies on module-level imports not visible here (pd, np,
    sns, plt, xgb, svm, pickle, datetime, sklearn metrics/model_selection,
    PCA, TSNE, stat, ...).
    """

    def __init__(self, model_name, X_train=None, y_train=None, X_test=None,
                 y_test=None, kfold=None):
        # Train/test splits and an optional CV splitter; y_pred is filled
        # lazily by get_predicate().
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.kfold = kfold
        self.y_pred = None
        self.model_name = model_name
        # Dispatch on the model name string.
        if self.model_name == "RandomForestClassifier":
            self.model = RandomForestClassifier()
        elif self.model_name == "LogisticRegression":
            self.model = LogisticRegression(solver='saga', random_state=0)
        elif self.model_name == "DecisionTreeClassifier":
            self.model = DecisionTreeClassifier()
        elif self.model_name == "XG_Boost":
            # NOTE(review): data_dmatrix is built but never used afterwards.
            data_dmatrix = xgb.DMatrix(data=self.X_train, label=self.y_train)
            self.model = xgb.XGBClassifier()
            print()
        elif self.model_name == "Multilayer Perceptron":
            # NOTE(review): leaves self.model unset — later calls will fail.
            print("not implemented yet")
        elif self.model_name == "svm":
            self.model = svm.SVC(kernel='linear', C=0.01)
        elif self.model_name == "adboost":
            self.model = AdaBoostClassifier()
        elif self.model_name == "gradienBoost":
            self.model = GradientBoostingClassifier()

    def fit(self):
        # Fit the selected model on the training split.
        print("fitting the ", self.model_name)
        self.model.fit(self.X_train, self.y_train)

    def get_predicate(self):
        # Predict on the test split and cache the result in self.y_pred.
        print("predicting by ", self.model_name)
        self.y_pred = pd.Series(self.model.predict(self.X_test), name="predict")
        return self.y_pred

    def get_MSE(self):
        # Mean squared error of cached predictions vs. held-out labels.
        return mean_squared_error(self.y_test, self.y_pred)

    def get_score(self):
        # Negated R^2, used as a loss-like score (lower is better).
        return -(r2_score(self.y_test, self.y_pred))

    def get_loss(self):
        # RMSE of cached predictions.
        return np.sqrt(mean_squared_error(self.y_test, self.y_pred))

    def validate_model(self):
        """Plot a confusion matrix plus count plots; return (matrix, df)."""
        print("validate the model")
        model_fit = pd.DataFrame()
        model_fit = pd.concat([self.y_pred, self.y_test], axis=1)
        matrix = confusion_matrix(self.y_test, self.y_pred)
        fig, axs = plt.subplots(1, 3, squeeze=False, figsize=(15, 3))
        plt.rcParams.update({'font.size': 10})
        # plot_confusion_matrix is the pre-1.2 sklearn helper (deprecated in
        # favour of ConfusionMatrixDisplay).
        d = plot_confusion_matrix(self.model, self.X_test, self.y_test,
                                  display_labels=["yes", "no"],
                                  cmap=plt.cm.Blues, ax=axs[0, 2])
        d.ax_.set_title("{} confusion matrix".format(self.model_name))
        total = float(len(model_fit))
        # NOTE(review): indentation reconstructed from a mangled source — the
        # countplot loop appears to run once per axis, redrawing each column.
        for ax in axs.flatten():
            plt.rcParams.update({'font.size': 16})
            for i, var in enumerate(model_fit.columns):
                ax = sns.countplot(var, data=model_fit, ax=axs[0][i])
                ax.set_title(self.model_name)
                for p in ax.patches:
                    # Annotate each bar with its share of all rows.
                    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
                    x = p.get_x() + p.get_width()
                    y = p.get_height()
                    ax.annotate(percentage, (x, y), ha='center')
        #fig.savefig("https://github.com/muluwork-shegaw/10Academy-week6/blob/master/data/{}.png".format(self.model_name))
        return matrix, model_fit

    def get_eff_model(self):
        """Return a one-row DataFrame of holdout metrics (skipped for svm)."""
        if self.model_name != "svm":
            print("calculate model performance ")
            metrics = pd.DataFrame()
            metrics["model"] = [self.model_name]
            metrics["MSE"] = mean_squared_error(self.y_test, self.y_pred)
            metrics["Loss"] = np.sqrt(
                mean_squared_error(self.y_test, self.y_pred))
            metrics["Score"] = -(r2_score(self.y_test, self.y_pred))
            metrics["Kappa"] = cohen_kappa_score(self.y_test, self.y_pred)
            metrics["ROC_Auc"] = roc_auc_score(self.y_test, self.y_pred)
            metrics["precision"] = precision_score(self.y_test, self.y_pred)
            metrics["recall"] = recall_score(self.y_test, self.y_pred)
            metrics["f1_score"] = f1_score(self.y_test, self.y_pred)
            metrics["accuracy"] = accuracy_score(self.y_test, self.y_pred)
            return metrics

    def get_accuracy_with_kfold(self):
        # Mean CV accuracy. NOTE(review): cross-validates on the *test*
        # split, not the training split — confirm this is intended.
        return cross_val_score(self.model, self.X_test, self.y_test,
                               cv=self.kfold, scoring='accuracy').mean()

    def get_loss_with_kfold(self, valid_data, valid_targ, k_fold):
        # NOTE(review): the three parameters are accepted but ignored; the
        # method always uses self.X_test / self.y_test / self.kfold.
        return -(cross_val_score(self.model, self.X_test, self.y_test,
                                 cv=self.kfold, scoring='neg_log_loss').mean())

    def eff_model_with_kfold(self):
        """Cross-validated metrics table, one column per scorer (skips svm)."""
        if self.model_name != "svm":
            print("calculate model performance with stratified k_fold")
            scoring = [
                "accuracy", "roc_auc", "neg_log_loss", "r2",
                "neg_mean_squared_error", "neg_mean_absolute_error"
            ]
            metrics = pd.DataFrame()
            metrics["model"] = [self.model_name]
            for scor in scoring:
                score = []
                result = cross_val_score(estimator=self.model,
                                         X=self.X_test,
                                         y=self.y_test,
                                         cv=self.kfold,
                                         scoring=scor)
                score.append(result.mean())
                metrics[scor] = pd.Series(score)
            return metrics

    def get_feature_impo(self):
        # Bar-plot feature importances (tree models only — LogisticRegression
        # has no feature_importances_ attribute).
        if self.model_name != "LogisticRegression":
            feat_importance = pd.Series(self.model.feature_importances_,
                                        index=self.X_train.columns)
            feat_importance.plot(kind='bar')
            plt.show()
            return feat_importance

    def get_summary(self):
        # for feature importance of logistic regression
        if self.model_name == "LogisticRegression":
            # NOTE(review): `X` is undefined in this scope — presumably meant
            # self.X_train; as written this raises NameError.
            denom = (2.0 * (1.0 + np.cosh(self.model.decision_function(X))))
            denom = np.tile(denom, (X.shape[1], 1)).T
            F_ij = np.dot((X / denom).T, X)  ## Fisher Information Matrix
            Cramer_Rao = np.linalg.inv(F_ij)  ## Inverse Information Matrix
            sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
            z_scores = self.model.coef_[
                0] / sigma_estimates  # z-score for each model coefficient
            p_values = [stat.norm.sf(abs(x)) * 2
                        for x in z_scores]  ### two tailed test for p-values
            # NOTE(review): the four self-assignments below are no-ops, and
            # summary reads self.z_scores / self.p_values which are never set
            # — likely should be self.z_scores = z_scores etc.
            z_scores = z_scores
            p_values = p_values
            sigma_estimates = sigma_estimates
            F_ij = F_ij
            summary = pd.DataFrame()
            summary["features"] = self.X_train.columns
            summary["z_score"] = self.z_scores
            summary["p_value"] = self.p_values
            sns.barplot(summary["features"], summary["p_value"], data=summary)
            return summary

    def save_model(self):
        # Pickle the fitted model to '<YYYY-MM-DD>.pkl'; returns the filename.
        now = datetime.datetime.now().strftime('%Y-%m-%d')
        # Saving model to disk
        filename = now + '.pkl'
        pickle.dump(self.model, open(filename, 'wb'))
        return filename

    '''
    use stratified k-fold cross-validation with imbalanced datasets to preserve
    the class distribution in the train and test sets for each evaluation of a
    given model.
    '''

    def make_it_stratified(self, data, target, reduction_model='pca', dim=7,
                           show=False):
        """Stratified 5-fold evaluation after dimensionality reduction.

        Returns (mean-metrics DataFrame indexed by model name, list of
        per-fold prediction DataFrames when show=True).
        """
        X = data.drop(target, axis=1)
        y = data[target]
        eff = []
        model_pred = []
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
        #enumerate the splits and summarize the distributions
        i = 0
        for train_ix, test_ix in kfold.split(X, y):
            i = i + 1
            print("k_fold -{} with {} model".format(i, self.model_name))
            if reduction_model == 'pca':  # using PCA
                # NOTE(review): `dim` is ignored (n_components hard-coded to
                # 7), and PCA is fit on ALL rows every fold — information
                # leaks across the train/test split.
                pca = PCA(n_components=7)
                reduced_df = pca.fit_transform(
                    X)  # reduce the dimention and convert to data frame
                # columns=[f'pca {i}' for i in range(1,8)])
            elif reduction_model == 'tsne':  # using TSNE
                # NOTE(review): tsne is constructed but never applied, so
                # `reduced_df` is undefined on this branch.
                tsne = TSNE(n_components=7, n_iter=300)
            # select rows
            self.X_train, self.X_test = reduced_df[train_ix], reduced_df[
                test_ix]
            self.y_train, self.y_test = y[train_ix], y[test_ix]
            self.fit()
            self.get_predicate()
            if show == True:
                matrix, model_fit = self.validate_model()
                model_pred.append(model_fit)
            eff.append(self.get_eff_model())
        df_eff = pd.concat(eff)
        # Average the per-fold metric rows into a single summary row.
        df_eff = pd.DataFrame(df_eff.mean()).transpose()
        df_eff.index = [self.model_name]
        return df_eff, model_pred
histogram_base.SetTitle("") histogram_base.SetStats(False) histogram_base.SetMinimum(0.001) histogram_base.SetMaximum(10.) histogram_base.GetXaxis().SetTitle("Signal Eff.") histogram_base.GetYaxis().SetTitle("Background Eff.") histogram_base.Draw("hist") x_train = array.array("f", [0]) y_train = array.array("f", [0]) x_test = array.array("f", [0]) y_test = array.array("f", [0]) effs = np.linspace(0, 1, 50) train_scores = cls.decision_function(d_train) fpr_tr, tpr_tr, tresholds_tr = sklearn.metrics.roc_curve(t_train, train_scores, pos_label=None) for eff in effs: #print 'Fake rate at signal eff', eff, fpr_tr[np.argmax(tpr_tr>eff)] x_train.append(eff) y_train.append(fpr_tr[np.argmax(tpr_tr > eff)]) print 'from test sample' test_scores = cls.decision_function(d_test) fpr_te, tpr_te, tresholds_te = sklearn.metrics.roc_curve(t_test, test_scores, pos_label=None) for eff in effs: #print 'Fake rate at signal eff', eff, fpr_te[np.argmax(tpr_te>eff)]
import matplotlib.pyplot as plt import pandas as pd import mglearn from sklearn.datasets import load_iris from sklearn.ensemble import GradientBoostingClassifier from sklearn.linear_model import LogisticRegression iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data, \ iris.target, random_state=42) gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0) gbrt.fit(X_train, y_train) print("Decision function shape: {}".format( gbrt.decision_function(X_test).shape)) print("Decision function: {}".format(gbrt.decision_function(X_test[:6, :]))) print("Argmax of decision function:\n{}".format(\ np.argmax(gbrt.decision_function(X_test), axis=1))) print("Predictions:\n{}".format(gbrt.predict(X_test))) print("Predicted probabilities:\n{}".format(gbrt.predict_proba(X_test[:6]))) print("Sums: {}".format(gbrt.predict_proba(X_test[:6]).sum(axis=1))) print("Argmax of predicted probabilities:\n{}".format(\ np.argmax(gbrt.predict_proba(X_test), axis=1))) print("Predictions:\n{}".format(gbrt.predict(X_test))) logreg = LogisticRegression() named_target = iris.target_names[y_train]
import numpy as np from sklearn.ensemble import GradientBoostingClassifier from sklearn.datasets import make_blobs, make_circles from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt import mglearn X, y = make_circles(noise=0.25, factor=0.5) named_y = np.array(['blue', 'red'])[y] X_train, X_test, y_train, y_test, named_y_train, named_y_test = \ train_test_split(X, y, named_y) gbc = GradientBoostingClassifier().fit(X_train, named_y_train) print gbc.predict(X_test) print 'X_test shape:', X_test.shape print 'Decision function shape:', gbc.decision_function(X_test).shape print 'Decision function:', gbc.decision_function(X_test) print 'Probabilities:', gbc.predict_proba(X_test) fig, axes = plt.subplots(1, 3, figsize=(20, 5)) mglearn.tools.plot_2d_separator(gbc, X, ax=axes[0], alpha=0.4, fill=True, cm=mglearn.cm2) scores_images_df = mglearn.tools.plot_2d_scores(gbc, X, ax=axes[1], alpha=.4, cm=mglearn.ReBl)
return ceo ceo_lst = [] for sent in ceo_sentences: ceo = return_ceos(sent) ceo_lst.extend(ceo) ceo_lst = list(set(ceo_lst)) ceo_df = pd.DataFrame({'ceo': ceo_lst}) ceo_df.to_csv('results/ceo_matches.csv') ## CEO HIGH CONFIDENCE FALSE POSITIVES # including high confidence false positives as these may be # ceos not included in the training set ceo_conf_train = gbc.decision_function(X_per_train) ceosFP_train = y_per_train[(y_per_train != y_train_gbm) & (y_train_gbm==1) & (ceo_conf_train >= 2.0)] ceo_conf_test = gbc.decision_function(X_per_test) ceosFP_test = y_per_test[(y_per_test != y_test_gbm) & (y_test_gbm==1) & (ceo_conf_test >= 2.0)] ceosFP_idx = ceosFP_train.index.append(ceosFP_test.index).to_list() ceoFP_df = person_df.iloc[ceosFP_idx] ceoFP_sent = [nlp(sent) for sent in ceoFP_df.sentences.to_list()] ceoFP_lst =[] for sent in ceoFP_sent: ceoFP = return_ceos(sent) ceoFP_lst.extend(ceoFP) ceoFP_lst = list(set(ceoFP_lst))
from sklearn.ensemble make_blobs, make_circles X, y = make_circles(noise=0.25, factor=o.5, random_state=1) X y # rename the y label using blue and red y_named = np.array(['blue', 'red'])[y] X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split(X, y_named, y, random_state=0) gbrt = GradientBoostingClassifier(random_state=0) gbrt.fit(X_train, y_train_named) # model method decision_function has shape (n_samples,) gbrt.decision_function(X_test) X_test.shape gbrt.decision_function(X_test).shape # for binary classification, the negative class is the first entry of the classes_ attribute gbrt.decision_function(X_test) > 0 gbrt.predict(X_test) # make true/false into 0 and 1 greater_zero = (gbrt.decision_function(X_test) > 0).astype(int) pred = gbrt.classes_[greater_zero] pred gbrt.predict(X_test) # these two are the same
def run(self): if not self.verify_data(): print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m") raise Exception("BadTrainingInputData") applyClassWeights = False if self.parameters['classifier'] == 'GradientBoostingClassifier': clf = GradientBoostingClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], learning_rate=self.parameters['learning_rate'], subsample=self.parameters['subsample'], min_impurity_split=self.parameters['min_impurity_split'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'RandomForestClassifier': clf = RandomForestClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], bootstrap=self.parameters['bootstrap'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'ExtraTreesClassifier': clf = ExtraTreesClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], bootstrap=self.parameters['bootstrap'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier': rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0) clf0 = GradientBoostingClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], 
max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], learning_rate=self.parameters['learning_rate'], subsample=self.parameters['subsample'], min_impurity_split=self.parameters['min_impurity_split'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True clf = make_pipeline(rt, clf0) elif self.parameters['classifier'] == 'XGBClassifier': clf = XGBClassifier( learning_rate=self.parameters['learning_rate'], max_depth=self.parameters['max_depth'], n_estimators=self.parameters['n_estimators'], objective='binary:logitraw', colsample_bytree=self.parameters['colsample_bytree'], subsample=self.parameters['subsample'], min_child_weight=self.parameters['min_child_weight'], gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0, #reg_alpha=8, reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0, reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0, ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'MLPClassifier': classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']} clf = MLPClassifier(**classifierParams) elif self.parameters['classifier'] in ['SVC', 'LinearSVC']: ''' clf = SVC( C=1.0, cache_size=4000, class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=100000, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=True ) ''' bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False if self.parameters['classifier'] == 'LinearSVC': clf = 
LinearSVC( class_weight='balanced', dual=self.parameters['dual'], max_iter=self.parameters['max_iter'], C=self.parameters['C'], penalty=self.parameters['penalty'], loss=self.parameters['loss'], tol=self.parameters['tol'], verbose=True, ) else: # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000 clf = SVC( C=self.parameters['C'], cache_size=self.parameters['cache_size'], class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=self.parameters['degree'], gamma=self.parameters['gamma'], kernel=self.parameters['kernel'], max_iter=self.parameters['max_iter'], probability=False, random_state=None, shrinking=self.parameters['shrinking'], tol=self.parameters['tol'], verbose=True ) if bagged: n_estimators = bagged if 'bag_oversampling' in self.parameters: n_estimators = int(n_estimators * self.parameters['bag_oversampling']) clf0 = clf clf = BaggingClassifier( clf0, max_samples=1.0 / bagged, max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0, bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False, n_estimators=n_estimators, ) else: clf = AdaBoostClassifier( DecisionTreeClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], class_weight=self.parameters['class_weight'], criterion=self.parameters['criterion'], splitter=self.parameters['splitter'], max_features=self.parameters['max_features'], ), n_estimators=self.parameters['n_estimators'], learning_rate=self.parameters['learning_rate'], algorithm=self.parameters['algorithm'], ) #with 
open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile: # clf = pickle.load(inputFile) # preprocessing print("transformation...") if 'scaler' in self.parameters: if self.parameters['scaler'] == 'standard': self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X']) elif self.parameters['scaler'] == 'minmax': self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X']) elif self.parameters['scaler'] == 'robust': self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X']) else: self.scaler = None else: self.scaler = None if self.scaler: self.data['train']['X'] = self.scaler.transform(self.data['train']['X']) self.data['test']['X'] = self.scaler.transform(self.data['test']['X']) # SHUFFLE all samples before self.shuffle = False if self.shuffle: print("shuffle input data...") for dataset in self.datasets: nSamples = self.data[dataset][self.varsets[0]].shape[0] randomPermutation = np.random.permutation(nSamples) for var in self.varsets: self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0) # LIMIT number of training samples # recommended to also shuffle samples before, because they are ordered by signal/background limitNumTrainingSamples = self.parameters['limit'] if (limitNumTrainingSamples > 0): print("limit training samples to:", limitNumTrainingSamples) #for dataset in self.datasets: # for var in self.varsets: # self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples] for dataset in self.datasets: self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False) # oversample upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None if upscale: upscalemax = self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 
upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal indices = [] for i in range(len(self.data['train']['sample_weight'])): #print(x) x= self.data['train']['sample_weight'][i] if self.data['train']['y'][i] > 0.5: x *= upscalesignal n = x * upscale # limit oversampling factor! if n > upscalemax: n=upscalemax if n<1: n=1 intN = int(n) indices += [i]*intN #floatN = n-intN #if floatN > 0: # if random.uniform(0.0,1.0) < floatN: # indices += [i] self.data['train']['X'] = self.data['train']['X'][indices] self.data['train']['y'] = self.data['train']['y'][indices] self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices] self.verify_data() # BALANCE weights # calculate total weights and class_weights nSig = len([x for x in self.data['train']['y'] if x >= 0.5]) nBkg = len([x for x in self.data['train']['y'] if x < 0.5]) print("#SIG:", nSig) print("#BKG:", nBkg) weightsSignal = [] weightsBackground = [] for i in range(len(self.data['train']['sample_weight'])): if self.data['train']['y'][i] < 0.5: weightsBackground.append(self.data['train']['sample_weight'][i]) else: weightsSignal.append(self.data['train']['sample_weight'][i]) weightsSignal.sort() weightsBackground.sort() totalWeightSignal = sum(weightsSignal) totalWeightBackground = sum(weightsBackground) signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight'] backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground print("SUM of weights for signal:", totalWeightSignal) print("SUM of weights for background:", totalWeightBackground) if applyClassWeights: print("re-weight signals by:", signalReweight) print("re-weight background by:", backgroundReweight) for i in range(len(self.data['train']['sample_weight'])): if self.data['train']['y'][i] < 0.5: self.data['train']['sample_weight'][i] *= backgroundReweight else: 
self.data['train']['sample_weight'][i] *= signalReweight else: print("DO NOT re-weight signals by:", signalReweight) print("...") # TRAINING learningCurve = [] if self.parameters['classifier'] == 'XGBClassifier': clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True) else: try: clf = clf.fit(**self.data['train']) except: clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y']) if 'rounds' in self.parameters and self.parameters['rounds'] > 1: for rNumber in range(self.parameters['rounds']): results = clf.predict_proba(self.data['test']['X']) auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight']) print(" round ", rNumber, " AUC=", auc1) learningCurve.append(auc1) clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y']) print("***** FIT done") # TEST try: results = clf.decision_function(self.data['test']['X']) print("***** EVALUATION on test sample done") results_train = clf.decision_function(self.data['train']['X']) print("***** EVALUATION on training sample done") print("R:", results.shape, results) results = np.c_[np.ones(results.shape[0]), results] results_train = np.c_[np.ones(results_train.shape[0]), results_train] except: results = clf.predict_proba(self.data['test']['X']) results_train = clf.predict_proba(self.data['train']['X']) # ROC curve print("calculating auc...") auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight']) auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight']) print("AUC:", auc1, " (training:", auc_training, ")") print("**** compute quantiles") qx = np.array([0.01, 0.99]) qy = np.array([0.0, 0.0]) thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0) nS = len(results) for i in range(nS): thq.Fill(results[i][1]) thq.GetQuantiles(2, qy, qx) # rescaling of SCORE to [0, 1] minProb = 2.0 maxProb = 
-1.0 #for i in range(len(self.data['train']['X'])): # if results_train[i][1] > maxProb: # maxProb = results_train[i][1] # if results_train[i][1] < minProb: # minProb = results_train[i][1] #for i in range(len(self.data['test']['X'])): # if results[i][1] > maxProb: # maxProb = results[i][1] # if results[i][1] < minProb: # minProb = results[i][1] minProb = qy[0] maxProb = qy[1] delta = maxProb-minProb minProb -= delta * 0.01 maxProb += delta * 0.10 useSqrt = False # fill TRAINING SCORE histogram (class probability) h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0) h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0) for i in range(len(self.data['train']['X'])): result = (results_train[i][1]-minProb)/(maxProb-minProb)