def test_rgs(self):
    """Check that PaloBoost beats scikit-learn's GBM on a regression task.

    Both models get the same number of estimators, learning rate and
    maximum depth, are trained on the same split of the data returned by
    ``make_friedman1_poly``, and are compared on held-out RMSE. The test
    passes when PaloBoost's RMSE is strictly lower.
    """
    # No explicit random_state is passed anywhere below; the global NumPy
    # seed is presumably what makes the data and the split reproducible.
    np.random.seed(1)
    n_samples = 10000
    test_size = 0.2
    n_est = 100
    max_depth = 7
    lr = 0.1

    def rmse(y_true, y_pred):
        # Root-mean-squared error between targets and predictions.
        return np.sqrt(np.mean((y_true - y_pred) ** 2))

    X, y = make_friedman1_poly(n_samples=n_samples)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)

    model_palo = PaloBoost(distribution="gaussian",
                           n_estimators=n_est,
                           learning_rate=lr,
                           max_depth=max_depth)
    model_sklr = GradientBoostingRegressor(n_estimators=n_est,
                                           learning_rate=lr,
                                           max_depth=max_depth)

    # Fit order is kept (PaloBoost first) so both models consume the
    # global random stream in the same order as before.
    model_palo.fit(X_train, y_train)
    rmse_palo = rmse(y_test, model_palo.predict(X_test))

    model_sklr.fit(X_train, y_train)
    rmse_sklr = rmse(y_test, model_sklr.predict(X_test))

    # assertLess reports both values on failure, unlike assertTrue(a < b).
    self.assertLess(rmse_palo, rmse_sklr)
def test_cls(self):
    """Check that PaloBoost beats scikit-learn's GBM on a classification task.

    Both models get the same number of estimators, learning rate and
    maximum depth, are trained on the same split of the data returned by
    ``make_hastie_11_2``, and are compared on held-out ROC AUC of the
    positive-class probability. The test passes when PaloBoost's AUC is
    strictly higher.
    """
    # No explicit random_state is passed anywhere below; the global NumPy
    # seed is presumably what makes the data and the split reproducible.
    np.random.seed(1)
    n_samples = 10000
    test_size = 0.2
    n_est = 100
    max_depth = 7
    lr = 0.1

    X, y = make_hastie_11_2(n_samples)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)

    model_palo = PaloBoost(distribution="bernoulli",
                           n_estimators=n_est,
                           learning_rate=lr,
                           max_depth=max_depth)
    model_sklr = GradientBoostingClassifier(n_estimators=n_est,
                                            learning_rate=lr,
                                            max_depth=max_depth)

    # Fit order is kept (PaloBoost first) so both models consume the
    # global random stream in the same order as before.
    model_palo.fit(X_train, y_train)
    auc_palo = roc_auc_score(y_test, model_palo.predict_proba(X_test)[:, 1])

    model_sklr.fit(X_train, y_train)
    auc_sklr = roc_auc_score(y_test, model_sklr.predict_proba(X_test)[:, 1])

    # assertGreater reports both values on failure, unlike assertTrue(a > b).
    self.assertGreater(auc_palo, auc_sklr)
def load(self, model):
    """Restore this ensemble's state from a dictionary.

    Args:
        model: mapping with the keys ``"clb"`` (stored as
            ``self.calibrators``), ``"calibrate"``, ``"distribution"``
            and ``"est"``. ``"est"`` is an iterable of per-member
            payloads; each one is handed to ``PaloBoost.load`` on a
            freshly constructed ``PaloBoost``.
    """
    # NOTE: not yet
    self.calibrators = model["clb"]
    self.calibrate = model["calibrate"]
    self.distribution = model["distribution"]

    # Rebuild the members one by one, in the order they were stored.
    members = self.estimators = []
    for payload in model["est"]:
        member = PaloBoost()
        member.load(payload)
        members.append(member)
def fit(self, X, y):
    """Fit ``n_paloboost`` PaloBoost members on random subsets of the data.

    Each round draws a boolean row mask (independently per row, or per
    block of ``block_size`` consecutive rows when ``block_size`` is set),
    fits a fresh ``PaloBoost`` on the selected rows and, when the
    distribution is ``"bernoulli"`` and ``calibrate`` is truthy, fits an
    isotonic calibrator on the member's predictions for the remaining
    rows. ``feature_importances_`` ends up as the sum of the members'
    importances divided by ``n_paloboost``.

    Any state left over from a previous ``fit`` (estimators, calibrators,
    feature importances) is discarded first.

    Args:
        X: 2-D feature array, indexed as ``X[mask, :]``.
        y: target array aligned with the rows of ``X``. For
            ``"bernoulli"`` the class check sums ``y``, so 0/1 labels are
            presumably expected -- confirm against callers.

    Returns:
        None. For ``"bernoulli"``, if ``sum(y)`` is below 3 or above
        ``n - 3`` an error is logged and no member is fitted.
    """
    np.random.seed(self.random_state)
    n = X.shape[0]

    # Start from a clean slate: without this a second fit() would append
    # to the old calibrators and add onto the old importances.
    self.estimators = []
    self.calibrators = []
    self.feature_importances_ = None

    is_bernoulli = self.distribution == "bernoulli"
    if is_bernoulli:
        n_pos = np.sum(y)
        if n_pos < 3 or n_pos > n - 3:
            logging.error(("the target (y) needs to have "
                           "at least three examples of each class"))
            return None

    i = 0
    while i < self.n_paloboost:
        if self.block_size is not None:
            # Draw one decision per block and stretch it over the rows;
            # the extra block covers the remainder and is trimmed to n.
            n_block = int(n / self.block_size) + 1
            mask_block = np.random.rand(n_block) < self.subsample0
            mask = np.repeat(mask_block, self.block_size)[:n]
        else:
            mask = np.random.rand(n) < self.subsample0

        X_i, y_i = X[mask, :], y[mask]
        X_j, y_j = X[~mask, :], y[~mask]

        if (is_bernoulli
                and (np.unique(y_i).shape[0] == 1
                     or np.unique(y_j).shape[0] == 1)):
            # Single-class subset: draw again without advancing ``i``.
            # NOTE(review): there is no retry limit, so this can spin for
            # a long time if such splits are very likely.
            continue

        est = PaloBoost(distribution=self.distribution,
                        learning_rate=self.learning_rate,
                        max_depth=self.max_depth,
                        n_estimators=self.n_estimators,
                        subsample=self.subsample1,
                        subsample_splts=self.subsample2,
                        random_state=i * self.n_estimators)
        est.fit(X_i, y_i)
        self.estimators.append(est)

        if self.feature_importances_ is None:
            # Copy: the in-place += and /= below must not write through
            # to the first member's own importance array.
            self.feature_importances_ = np.array(est.feature_importances_,
                                                 dtype=float)
        else:
            self.feature_importances_ += est.feature_importances_

        if is_bernoulli and self.calibrate:
            # Calibrate on the rows this member did not train on.
            z_j = est.predict_proba(X_j)[:, 1]
            clb = IsotonicRegression(y_min=0,
                                     y_max=1,
                                     out_of_bounds="clip")
            clb.fit(z_j, y_j)
            self.calibrators.append(clb)

        i += 1

    # Nothing to average when no member was fitted (n_paloboost <= 0).
    if self.feature_importances_ is not None:
        self.feature_importances_ /= self.n_paloboost
def test_classification():
    """Benchmark three boosting classifiers and print a comparison table.

    Generates data with ``make_hastie_10_2``, splits it 50/50, then fits
    PaloBoost, GBM and scikit-learn's ``GradientBoostingClassifier`` with
    the same settings. For each model the fit time, predict time and test
    ROC AUC are printed. Nothing is asserted or returned.
    """
    X, y = make_hastie_10_2(n_samples=1000)
    y[y < 0] = 0  # recode negative labels as 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    models = {
        "palobst": PaloBoost(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="bernoulli",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingClassifier(n_estimators=100,
                                              learning_rate=1.0,
                                              max_depth=4,
                                              subsample=0.5),
    }

    print("\n")
    print("# Test Classification")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time auc ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-", 0.5))

    for name, model in models.items():
        # perf_counter is monotonic and high resolution, so the measured
        # intervals are not disturbed by system clock adjustments.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        time_fit = time.perf_counter() - start

        start = time.perf_counter()
        y_hat = model.predict_proba(X_test)[:, 1]
        time_pred = time.perf_counter() - start

        auc = roc_auc_score(y_test, y_hat)

        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, auc))

    print("-----------------------------------------------------")
    print("\n")
def test_regression():
    """Benchmark three boosting regressors and print a comparison table.

    Generates data with ``make_friedman1``, splits it 50/50, then fits
    PaloBoost, GBM and scikit-learn's ``GradientBoostingRegressor`` with
    the same settings. For each model the fit time, predict time and test
    RMSE are printed; the baseline row shows the standard deviation of
    the test targets. Nothing is asserted or returned.
    """
    X, y = make_friedman1(n_samples=100000, noise=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    models = {
        "palobst": PaloBoost(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm": GBM(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn": GradientBoostingRegressor(n_estimators=100,
                                             learning_rate=1.0,
                                             max_depth=4,
                                             subsample=0.5),
    }

    print("\n")
    print("# Test Regression")
    print("-----------------------------------------------------")
    print(" model_name train_time predict_time rmse ")
    print("-----------------------------------------------------")
    print(" {0:12} {1:12} {2:12} {3:.5f}".format("baseline", "-", "-",
                                                 np.std(y_test)))

    for name, model in models.items():
        # perf_counter is monotonic and high resolution, so the measured
        # intervals are not disturbed by system clock adjustments.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        time_fit = time.perf_counter() - start

        start = time.perf_counter()
        y_hat = model.predict(X_test)
        time_pred = time.perf_counter() - start

        rmse = np.sqrt(np.mean((y_test - y_hat)**2))

        print(" {0:12} {1:.5f} sec {2:.5f} sec {3:.5f}".format(
            name, time_fit, time_pred, rmse))

    print("-----------------------------------------------------")
    print("\n")
def clstask(X, y, n_estimators, learning_rate, max_depth, n_btstrp,
            has_missing, test_size, add_noise):
    """Evaluate boosting classifiers over repeated train/test splits.

    Args:
        X: feature matrix passed to ``train_test_split``.
        y: targets; when ``add_noise`` is set they are flipped with
            ``1 - y``, so 0/1 labels are presumably expected.
        n_estimators: number of boosting stages for every model.
        learning_rate: learning rate for every model.
        max_depth: maximum tree depth for every model.
        n_btstrp: number of splits; split ``idx`` uses ``random_state=idx``.
        has_missing: when falsy, scikit-learn's
            ``GradientBoostingClassifier`` is added to the comparison.
        test_size: forwarded to ``train_test_split``.
        add_noise: when truthy, about 20% of the training labels are
            flipped in each split (drawn from the global NumPy RNG).

    Returns:
        A DataFrame stacking the per-split results of
        ``utils.get_cls_perf``, each tagged with its split index in
        ``b_idx``. With ``n_btstrp == 0`` an empty frame with the columns
        ``model``, ``value``, ``n_est`` and ``b_idx`` is returned.
    """
    shared = dict(n_estimators=n_estimators,
                  learning_rate=learning_rate,
                  max_depth=max_depth,
                  subsample=0.7)
    models = {
        "0. PaloBoost": PaloBoost(distribution="bernoulli", **shared),
        "1. SGTB-Bonsai": GBM(distribution="bernoulli", **shared),
        "2. XGBoost": XGBClassifier(**shared),
    }
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingClassifier(**shared)

    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    frames = []
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        if add_noise:
            n_train = y_train.shape[0]
            mask = np.random.rand(n_train) < 0.2  # 20%
            y_train[mask] = 1 - y_train[mask]  # flip
        df = utils.get_cls_perf(models, X_train, y_train, X_test, y_test,
                                n_estimators)
        df['b_idx'] = idx
        frames.append(df)

    if not frames:
        return perf_df
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call; concatenate once instead. The empty
    # template stays in the list so its columns are always in the result.
    return pd.concat([perf_df] + frames, sort=True)
def regtask(X, y, n_estimators, learning_rate, max_depth, n_btstrp,
            has_missing, test_size):
    """Evaluate boosting regressors over repeated train/test splits.

    Args:
        X: feature matrix passed to ``train_test_split``.
        y: regression targets.
        n_estimators: number of boosting stages for every model.
        learning_rate: learning rate for every model.
        max_depth: maximum tree depth for every model.
        n_btstrp: number of splits; split ``idx`` uses ``random_state=idx``.
        has_missing: when falsy, scikit-learn's
            ``GradientBoostingRegressor`` is added to the comparison.
        test_size: forwarded to ``train_test_split``.

    Returns:
        A DataFrame stacking the per-split results of
        ``utils.get_reg_perf``, each tagged with its split index in
        ``b_idx``. With ``n_btstrp == 0`` an empty frame with the columns
        ``model``, ``value``, ``n_est`` and ``b_idx`` is returned.
    """
    shared = dict(n_estimators=n_estimators,
                  learning_rate=learning_rate,
                  max_depth=max_depth,
                  subsample=0.7)
    models = {
        "0. PaloBoost": PaloBoost(distribution="gaussian", **shared),
        "1. SGTB-Bonsai": GBM(distribution="gaussian", **shared),
        "2. XGBoost": XGBRegressor(**shared),
    }
    if not has_missing:
        models["3. Scikit-Learn"] = GradientBoostingRegressor(**shared)

    perf_df = pd.DataFrame(columns=["model", "value", "n_est", "b_idx"])
    frames = []
    for idx in range(n_btstrp):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        df = utils.get_reg_perf(models, X_train, y_train, X_test, y_test,
                                n_estimators)
        df['b_idx'] = idx
        frames.append(df)

    if not frames:
        return perf_df
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call; concatenate once instead. The empty
    # template stays in the list so its columns are always in the result.
    return pd.concat([perf_df] + frames, sort=True)
def run_aux(learning_rate, max_depth, n_estimators=200):
    """Fit a gaussian PaloBoost and dump its pruning and learning-rate stats.

    The model is trained on the data returned by ``get_friedman`` with
    ``subsample=0.7``. Two CSV files are written under ``results/``, both
    named after ``learning_rate`` and ``max_depth``: one from
    ``get_prune_stats`` and one from ``get_lr_stats``.

    Args:
        learning_rate: learning rate given to the model.
        max_depth: maximum tree depth given to the model.
        n_estimators: number of boosting stages (default 200).
    """
    features, target = get_friedman()
    booster = PaloBoost(
        distribution="gaussian",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=0.7,
    )
    booster.fit(features, target)

    # Collect both statistics tables before writing anything to disk.
    pruning = pd.DataFrame(booster.get_prune_stats())
    pruning.columns = ["iteration", "nodes_pre", "nodes_post"]
    rates = pd.DataFrame(booster.get_lr_stats())
    rates.columns = ["iteration", "lr"]

    prune_path = "results/prune_{}_{}.csv".format(learning_rate, max_depth)
    pruning.to_csv(prune_path, index=False)
    lr_path = "results/lr_{}_{}.csv".format(learning_rate, max_depth)
    rates.to_csv(lr_path, index=False)
def main():
    """Benchmark boosting classifiers on the in-hospital-death data.

    Parses the command line, joins ``data/featureSet3_48.csv`` with the
    ``In-hospital_death`` outcome from ``data/outcomes-a.txt`` on
    ``RecordID``, and evaluates PaloBoost, GBM and XGBoost on ten
    train/test splits (``random_state`` 0-9) via
    ``evalutils.get_cls_perf``. The stacked results are written to
    ``<outfile>_<n>_<lr>_<depth>_<sub>.csv`` and a summary is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7,
                        help="subsample rate")
    # The help text used to repeat "subsample rate" by mistake.
    parser.add_argument("-depth", type=int, default=5,
                        help="maximum tree depth")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    # 30% training, 70% test - to highlight the overfitting aspect of the
    # models
    test_size = 0.7
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/featureSet3_48.csv")
    outcomes = pd.read_csv("data/outcomes-a.txt")
    outcomes = outcomes[["RecordID", "In-hospital_death"]]
    data = pd.merge(data, outcomes, how="inner", on="RecordID")

    # Every column except the identifier and the outcome fields is a feature.
    excluded = ["RecordID", "Length_of_stay", "In-hospital_death"]
    col_names_x = [cname for cname in data.columns if cname not in excluded]
    X = pp.simple_pp(data[col_names_x]).values
    y = data["In-hospital_death"].values

    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }

    boostPerf = pd.DataFrame(columns=[
        "0. PaloBoost ",
        "1. SGTB-Bonsai",
        "2. XGBoost ",
        "nEst",
        "idx",
    ])
    frames = []
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx)
        perf_df = evalutils.get_cls_perf(models, X_train, y_train, X_test,
                                         y_test, n_estimators)
        perf_df["idx"] = idx
        frames.append(perf_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once, keeping
    # the empty template first so its columns are always in the result.
    boostPerf = pd.concat([boostPerf] + frames)

    # store it to the file
    boostPerf.to_csv(
        (args.outfile + "_{0}_{1}_{2}_{3}.csv".format(
            n_estimators, learning_rate, max_depth, subsample)),
        index=False,
    )

    # Per split, take the column-wise maximum; then average over the splits.
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())
def main():
    """Benchmark boosting classifiers on the ``data/6Hr-data.csv`` data.

    Parses the command line, uses the ``ca`` column as the target and the
    remaining columns (after ``pp.simple_pp``) as features, and evaluates
    PaloBoost, GBM, XGBoost and scikit-learn's GBM on ten train/test
    splits (``random_state`` 0-9) via ``eval_utils.get_cls_perf``. The
    stacked results are written to
    ``<outfile>_<n>_<lr>_<depth>_<sub>.csv`` and a summary is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("outfile", help="filename for performance (csv)")
    parser.add_argument("-n", type=int, default=200,
                        help="number of estimators")
    parser.add_argument("-lr", type=float, default=1.0, help="learning rate")
    parser.add_argument("-sub", type=float, default=0.7,
                        help="subsample rate")
    # The help text used to repeat "subsample rate" by mistake.
    parser.add_argument("-depth", type=int, default=5,
                        help="maximum tree depth")
    args = parser.parse_args()

    # Parameters
    n_estimators = args.n
    learning_rate = args.lr  # 1.0, 0.5, 0.1
    # 30% training, 70% test - to highlight the overfitting aspect of the
    # models
    test_size = 0.7
    subsample = args.sub
    max_depth = args.depth

    data = pd.read_csv("data/6Hr-data.csv")
    y = data["ca"].values
    X = pp.simple_pp(data.drop(columns="ca")).values

    print("- Avg(y): {}, Std(y): {}".format(np.mean(y), np.std(y)))

    models = {
        "0. PaloBoost ": PaloBoost(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "1. SGTB-Bonsai": GBM(
            distribution="bernoulli",
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "2. XGBoost ": XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
        "3. Scikit-Learn ": GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
        ),
    }

    boostPerf = pd.DataFrame(
        columns=[
            "0. PaloBoost ",
            "1. SGTB-Bonsai",
            "2. XGBoost ",
            "3. Scikit-Learn ",
            "nEst",
            "idx",
        ]
    )
    frames = []
    for idx in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=idx
        )
        perf_df = eval_utils.get_cls_perf(
            models, X_train, y_train, X_test, y_test, n_estimators
        )
        perf_df["idx"] = idx
        frames.append(perf_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once, keeping
    # the empty template first so its columns are always in the result.
    boostPerf = pd.concat([boostPerf] + frames)

    # store it to the file
    boostPerf.to_csv(
        (
            args.outfile
            + "_{0}_{1}_{2}_{3}.csv".format(
                n_estimators, learning_rate, max_depth, subsample
            )
        ),
        index=False,
    )

    # Per split, take the column-wise maximum; then average over the splits.
    tmpDF = boostPerf.groupby(["idx"]).max()
    print(tmpDF.mean())