def _test_elbs(model):
    # Score the model on the held-out ELB set; elb_data, elb_y,
    # elb_cv_results and _print_stats are module-level globals.
    elb_pred = model.predict(elb_data)
    score = score_util.ScoreReport(elb_y, elb_pred)
    elb_cv_results.append(score.abs_99)
    print(score)
    print(f"elb yield stats: {_print_stats(elb_y)}")
    print(f"elb prediction stats: {_print_stats(elb_pred)}")
Example #2
def _test_elbs(model):
    # Variant of the above that skips scoring when the evaluation DMatrix is empty.
    if elb_data.num_row() == 0:
        print("no test elbs found, skipping scoring")
        return

    elb_pred = model.predict(elb_data)
    score = score_util.ScoreReport(elb_y, elb_pred)
    elb_cv_results.append(score.abs_99)
    print(score)
    print(f"elb yield stats: {_print_stats(elb_y)}")
    print(f"elb prediction stats: {_print_stats(elb_pred)}")
Example #3
def _eval(pred, d: xgb.DMatrix):
    # Custom xgboost eval metric: ScoreReport's abs_99 scaled to an integer.
    _scr = score_util.ScoreReport(d.get_label(), pred)
    return '3std_100x', int(round(_scr.abs_99 * 100, 0))
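
_eval follows the custom-metric contract of xgboost's pre-2.0 learning API: feval receives the raw predictions and the evaluation DMatrix and returns a (name, value) pair. A self-contained run with synthetic data and a hypothetical inline metric of the same shape:

import numpy as np
import xgboost as xgb

def _abs99_feval(pred, d):
    # Same (name, value) contract as _eval above, without the score_util dependency.
    err = np.abs(d.get_label() - pred)
    return '3std_100x', int(round(np.percentile(err, 99) * 100, 0))

X = np.random.rand(200, 3)
y = X.sum(axis=1)
dtrain = xgb.DMatrix(X, label=y)
# feval is evaluated each round for every entry in evals.
booster = xgb.train({'max_depth': 2}, dtrain, num_boost_round=5,
                    evals=[(dtrain, 'train')], feval=_abs99_feval)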
Example #4
                return '3std_100x', int(round(_scr.abs_99 * 100, 0))

            print(f"training xgb model")
            model: xgb.Booster = xgb.train(
                default_xgb_params,
                train,
                num_boost_round=xgb_n_rounds,
                early_stopping_rounds=xgb_early_stopping,
                evals=eval_list,
                evals_result=eval_result,
                feval=_eval,
                verbose_eval=verbose_eval)

            predictions = model.predict(test,
                                        ntree_limit=model.best_ntree_limit)
            score = score_util.ScoreReport(test.get_label(), predictions)
            _results = grid_cv_results[i]
            _results.add(score.abs_99,
                         iteration=model.best_iteration,
                         mean=score.abs_mean,
                         std_dev=score.abs_std,
                         y_min=np.min(train.get_label()),
                         y_max=np.max(train.get_label()),
                         pred_min=np.min(predictions),
                         pred_max=np.max(predictions))

            print(f"""
GRID SEARCH RESULT: ({curr_model_training} of total: {total_model_trainings})
{test_run_idx + 1}, {idx + 1}, {i + 1} of {len(cv_params)}, {p}
training yield stats: {_print_stats(train.get_label())}
pred yield stats: {_print_stats(predictions)}""")
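
A side note on the predict call above: ntree_limit was deprecated in xgboost 1.4 and later removed; on recent versions the equivalent for the same early-stopped booster is:

predictions = model.predict(test, iteration_range=(0, model.best_iteration + 1))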
Example #5
        epoch = 0
        epoch_scores: List[score_util.ScoreReport] = []
        test_diff = float('inf')
        while epoch < epochs_max:
            print(f"model current estimators: {model.n_estimators}")
            n_estimators = model.n_estimators + n_threads
            print(f"setting estimators: {n_estimators}")
            model.n_estimators = n_estimators

            print("fitting model")
            model.fit(train, train_label)

            print("scoring")
            predictions = model.predict(test)
            score = score_util.ScoreReport(test_label, predictions, store_predictions=False)
            epoch_scores.append(score)  # history consumed by the convergence check below

            print(f"""
EPOCH RESULT: (training {curr_model_training} of total: {total_model_trainings})
epoch: {epoch}
n_estimators: {n_estimators}""")
            print(f"abs_99: {score.abs_99:.2f}")
            mem_util.print_mem_usage()

            if epoch > 0:
                test_diff = epoch_scores[epoch - 1].abs_std_3 - score.abs_std_3
                print(f"score diff: {test_diff:.2f}")

            epoch += 1
            if test_diff < train_tol and epoch >= epochs_min:
                print(f"ending early. test diff: {test_diff:.2f}, epochs: {epoch}")
                break
Example #6
# def _run_dummies():
column_categories = categorical_util.get_categories_lookup(dtrain)
categorical_util.set_categories(dtrain, column_categories)
categorical_util.set_categories(dtest, column_categories)

dtrain_y = dtrain.pop('Dry_Yield')
dtrain = categorical_util.encode_dummies(dtrain)
dummy_cols = dtrain.columns
print(dummy_cols)
dtrain = dtrain.to_coo()

dtest_y = dtest.pop('Dry_Yield')
dtest = pandas.get_dummies(dtest,
                           sparse=True,
                           drop_first=True,
                           dummy_na=False,
                           prefix_sep='__DUMMY__').to_sparse().to_coo()

print(f"dmatrix sizes: {dtrain.shape[1]}, {dtest.shape[1]}")

dtrain: xgb.DMatrix = xgb.DMatrix(dtrain, dtrain_y)
dtest: xgb.DMatrix = xgb.DMatrix(dtest, label=dtest_y)

model: xgb.Booster = xgb.train({'max_depth': 2}, dtrain, num_boost_round=2)
pred = model.predict(dtest)

scr = score_util.ScoreReport(dtest_y, pred)
print(scr)

# _run_dummies()
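
to_sparse() and SparseDataFrame.to_coo() were removed in pandas 1.0. A sketch of the same idea on modern pandas (dummy-encode, keep the test columns aligned with training, hand scipy matrices onward); the dtrain/dtest names mirror the DataFrames at the top of the snippet:

import pandas as pd
import scipy.sparse as sp

train_d = pd.get_dummies(dtrain, drop_first=True, dummy_na=False,
                         prefix_sep='__DUMMY__')
test_d = pd.get_dummies(dtest, drop_first=True, dummy_na=False,
                        prefix_sep='__DUMMY__')
# Reindex so test has exactly the training columns (unseen dummies become 0).
test_d = test_d.reindex(columns=train_d.columns, fill_value=0)
train_coo = sp.coo_matrix(train_d.to_numpy())
test_coo = sp.coo_matrix(test_d.to_numpy())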
Example #7
column_categories = categorical_util.encode_categories(data)
dummy_enc = categorical_util.DummyEncoder(data.columns, column_categories)

print("Fitting dummy enc")

data: np.ndarray = dummy_enc.fit_transform(data.to_numpy())

kf_outer = GroupKFold()
split = next(kf_outer.split(data, groups=df_year_id))
train_idx, test_idx = split
train, train_y = data[train_idx], data_label[train_idx]
test, test_y = data[test_idx], data_label[test_idx]

model = XGBRegressor(max_depth=5, n_estimators=100, silent=False, n_jobs=2)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)

model = RandomForestRegressor(verbose=99, n_estimators=50, n_jobs=2)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)

model = ExtraTreesRegressor(verbose=99, n_estimators=50, n_jobs=2)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)

model = SVR(degree=5, verbose=99)  # degree only applies to the 'poly' kernel; the default is 'rbf'
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)
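
The split above only evaluates the first GroupKFold fold. To score across every fold while still keeping each group's rows out of its training split, scikit-learn's helpers accept the groups directly; a sketch reusing data, data_label, and df_year_id from above:

from sklearn.model_selection import GroupKFold, cross_val_score
from xgboost import XGBRegressor

scores = cross_val_score(XGBRegressor(max_depth=5, n_estimators=100, n_jobs=2),
                         data, data_label,
                         groups=df_year_id, cv=GroupKFold(n_splits=5))
print(scores.mean(), scores.std())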
Example #8
def _score_cv(est: GridSearchCV, X, y):
    # Record each distinct best estimator seen across CV scoring calls.
    if est.best_estimator_ not in cv_results:
        cv_results.append(est.best_estimator_)

    predictions = est.predict(X)
    return score_util.ScoreReport(y, predictions).abs_99
Example #9
def _score_grid_search(est: Pipeline, X, y):
    predictions = est.predict(X)
    _print_mem_usage()
    return score_util.ScoreReport(y, predictions).abs_99
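
Both _score_cv and _score_grid_search match scikit-learn's callable-scorer signature, scorer(estimator, X, y) -> float, so they can be passed straight to the scoring argument. One caveat: scikit-learn maximizes the score, and abs_99 is an error, so it would normally be negated. A self-contained sketch with a hypothetical stand-in scorer:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def _neg_abs99(est, X, y):
    # Same shape as _score_grid_search, negated so that "greater is better".
    err = np.abs(y - est.predict(X))
    return -float(np.percentile(err, 99))

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
search = GridSearchCV(RandomForestRegressor(random_state=0),
                      {'n_estimators': [10, 20]},
                      scoring=_neg_abs99, cv=3)
search.fit(X, y)
print(search.best_params_, search.best_score_)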