Example No. 1
def bayesian_optimize(clf,
                      param_grid,
                      X_train,
                      X_test,
                      y_train,
                      y_test,
                      beta=0.2,
                      threshold=0.1,
                      n_iter=20,
                      verbose=0,
                      n_jobs=1):
    """
    Wrapper for Bayesian Optimization for hyperparameter tuning.
    
    Arguments:
    clf -- input classifier. May be of sklearn class.
    param_grid -- input parameter grid, in a dictionary format.
    X_train, X_test, y_train, y_test -- X and y data.
    beta -- float, beta < 1 favors more on NPV, while beta > 1 more on specificity
    threshold -- float, Threshold for classifying positive vs. negative classes.
    n_iter -- int, number of iterations to carry out Bayesian Optimization.
    verbose -- int, verbosity controller.
    n_jobs -- int, number of processes/threads being used.
    
    Returns:
    opt_rf -- The optimized classifier.
    """

    # Wrap the classifier in BayesSearchCV, scored with the custom negative-class F-beta metric
    opt_rf = BayesSearchCV(clf,
                           param_grid,
                           n_iter=n_iter,
                           verbose=verbose,
                           scoring=make_scorer(adjusted_neg_Fbeta_score,
                                               needs_proba=True,
                                               beta=beta,
                                               threshold=threshold),
                           n_jobs=n_jobs)

    # Callback: stop the search early once the best score is good enough.
    def on_step(optim_result):
        score = opt_rf.best_score_
        print("\t- best score: %s" % score)
        if score >= 0.99:
            print('* Interrupting...')
            return True  # returning True tells skopt to stop the optimization

    opt_rf.fit(X_train, y_train.ravel(), callback=[on_step])

    y_prob = np.asarray(opt_rf.predict_proba(X_test))
    y_test = np.asarray(y_test)
    summarize_res(opt_rf, y_prob, y_test)

    return opt_rf
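
The custom scorer adjusted_neg_Fbeta_score passed to make_scorer above is not defined in this example. Below is a minimal sketch of what such a scorer might look like, assuming the beta/threshold semantics described in the docstring (an F-beta combination of NPV and specificity computed on thresholded probabilities); the original author's exact formula may differ.

import numpy as np

def adjusted_neg_Fbeta_score(y_true, y_prob, beta=0.2, threshold=0.1):
    """Hypothetical F-beta score on the negative class: combines NPV
    (negative-class precision) with specificity (negative-class recall)."""
    y_prob = np.asarray(y_prob)
    # sklearn scorers may pass either the positive-class column or the
    # full (n_samples, 2) probability matrix; handle both.
    if y_prob.ndim == 2:
        y_prob = y_prob[:, 1]
    y_pred = (y_prob >= threshold).astype(int)
    y_true = np.asarray(y_true).ravel()

    tn = np.sum((y_pred == 0) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))

    npv = tn / (tn + fn) if (tn + fn) else 0.0          # negative-class precision
    specificity = tn / (tn + fp) if (tn + fp) else 0.0  # negative-class recall

    denom = beta ** 2 * npv + specificity
    return (1 + beta ** 2) * npv * specificity / denom if denom else 0.0
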
Example No. 2
def run_shallow(data_dir: str, results_dir: str, splits: List[str],
                metric: str, n_iter: int, n_points: int, n_folds: int,
                n_jobs: int) -> None:
    """Evaluate shallow baselines on the scruples resource.

    Train shallow baseline models on the scruples resource, reading
    the dataset from DATA_DIR, and writing trained models, logs, and
    other results to RESULTS_DIR. Performance is reported for each split
    provided as an argument.
    """
    # Step 1: Manage and construct paths.

    logger.info('Creating the results directory.')

    os.makedirs(results_dir)
    model_paths = {}
    metrics_paths = collections.defaultdict(dict)
    predictions_paths = collections.defaultdict(dict)
    for baseline in baselines.resource.SHALLOW_BASELINES.keys():
        os.makedirs(os.path.join(results_dir, baseline))
        model_paths[baseline] = os.path.join(results_dir, baseline,
                                             'model.pkl')
        for split in splits:
            os.makedirs(os.path.join(results_dir, baseline, split))
            metrics_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'metrics.json')
            predictions_paths[baseline][split] = os.path.join(
                results_dir, baseline, split, 'predictions.jsonl')

    # Step 2: Load the data.

    logger.info(f'Loading the data from {data_dir}.')

    dataset = ScruplesResource(data_dir=data_dir)

    # Step 3: Run the baselines.

    logger.info('Running the baselines.')

    for baseline, (Model, hyper_parameter_space) in tqdm.tqdm(
            baselines.resource.SHALLOW_BASELINES.items(),
            **settings.TQDM_KWARGS):
        # tune the hyper-parameters and train the model
        ids, features, labels, label_scores = dataset.train
        if hyper_parameter_space:
            model = BayesSearchCV(
                Model,
                hyper_parameter_space,
                scoring=make_scorer(score_func=METRICS[metric][1],
                                    **METRICS[metric][2]),
                n_iter=n_iter,
                n_points=n_points,
                cv=n_folds,
                n_jobs=os.cpu_count() if n_jobs == 0 else n_jobs,
                refit=True)
        else:
            model = Model
        model.fit(features, labels)

        # Step 4: Save the model.

        with open(model_paths[baseline], 'wb') as model_file:
            dill.dump(model, model_file)

        # Step 5: Run evaluation on the splits.

        for split in splits:
            ids, features, labels, label_scores = getattr(dataset, split)

            predictions = model.predict(features)
            probabilities = model.predict_proba(features)

            with open(metrics_paths[baseline][split], 'w') as metrics_file:
                # inside this comprehension, `metric` shadows the CLI argument
                # and refers to each metric function from the METRICS registry
                json.dump(
                    {
                        key: metric(
                            y_true=labels,
                            y_pred=probabilities
                            if scorer_kwargs['needs_proba'] else predictions)
                        for key, (_, metric, scorer_kwargs) in METRICS.items()
                    }, metrics_file)

            with open(predictions_paths[baseline][split], 'w')\
                 as predictions_file:
                for id_, probs, prediction in zip(ids, probabilities,
                                                  predictions):
                    predictions_file.write(
                        json.dumps({
                            'id': id_,
                            'label': prediction.tolist(),
                            'label_scores': probs.tolist()
                        }) + '\n')
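
The METRICS registry referenced above is defined elsewhere in the scruples codebase. Judging by how it is indexed here (METRICS[metric][1] as the score function, METRICS[metric][2] as the make_scorer kwargs, and three-way unpacking in the metrics loop), each value is presumably a (display name, metric function, scorer kwargs) triple. A minimal sketch of that assumed layout, using standard sklearn metrics purely for illustration:

from sklearn.metrics import accuracy_score, f1_score, log_loss

# hypothetical layout: key -> (display name, metric function, make_scorer kwargs)
METRICS = {
    'accuracy': ('accuracy', accuracy_score, {'needs_proba': False}),
    'f1_macro': ('f1 (macro)',
                 lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
                 {'needs_proba': False}),
    'log_loss': ('log loss', log_loss,
                 {'needs_proba': True, 'greater_is_better': False}),
}
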
Example No. 3
    test_df = as_category(test_df)

    test_X = test_df.drop(['CONTACT_DATE', 'SNAP_DATE'], axis=1)

    if clf_name != 'FeaturePredictor':
        cols = list(set(test_X.columns).difference(test_X.select_dtypes(include='category').columns))
        test_X.loc[:, cols] = test_X.loc[:, cols].fillna(0).replace([np.inf, -np.inf], 0)
        for c in test_X.select_dtypes(include='category').columns:
            test_X.loc[:, c] = test_X.loc[:, c].cat.codes

    # Adversarial validation: refit the tuned model on the training rows that
    # least resemble the test set and score it on the most test-like rows.
    adv_train_x, adv_train_y, adv_test_x, adv_test_y = adversial_train_test_split(train_X.loc[:, features], train_y,
                                                                                  test_X.loc[:, features],
                                                                                  topK=1000)
    bayes_cv_tuner._fit_best_model(adv_train_x, adv_train_y)
    adv_pred_y = bayes_cv_tuner.predict_proba(adv_test_x)[:, 1]
    adv_auc = roc_auc_score(adv_test_y, adv_pred_y)
    print(f'Adversarial AUC = {adv_auc} by {len(adv_test_y)} samples')

    bayes_cv_tuner._fit_best_model(train_X, train_y)
    test_y = bayes_cv_tuner.predict_proba(test_X)
    df = pd.DataFrame(test_y[:, 1])
    df.to_csv(f"submits/"
              f"{best_estimator.__class__.__name__}"
              f"_{datetime.now().strftime('%d_%H_%M')}"
              f"_{bayes_cv_tuner.best_score_:0.4f}"
              f"_{adv_auc:0.4f}.csv",
              header=None,
              index=None)
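
adversial_train_test_split is not shown here. The usual idea behind such a helper is adversarial validation: fit a classifier to distinguish training rows from test rows, then hold out the topK most test-like training rows as a validation set. A minimal sketch under that assumption (the original implementation and model choice may differ):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def adversial_train_test_split(train_x, train_y, test_x, topK=1000):
    """Hold out the topK training rows that look most like the test set."""
    # Label train rows 0 and test rows 1, then learn to tell them apart.
    adv_x = pd.concat([train_x, test_x], axis=0)
    adv_target = np.r_[np.zeros(len(train_x)), np.ones(len(test_x))]
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    clf.fit(adv_x, adv_target)

    # Score each training row by how "test-like" the classifier finds it.
    test_likeness = clf.predict_proba(train_x)[:, 1]
    order = np.argsort(test_likeness)
    fit_idx, val_idx = order[:-topK], order[-topK:]

    train_y = np.asarray(train_y)
    return (train_x.iloc[fit_idx], train_y[fit_idx],
            train_x.iloc[val_idx], train_y[val_idx])
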
Example No. 4
}

# scorer
metric = make_scorer(score_func=log_loss,
                     greater_is_better=False,
                     needs_proba=True,
                     labels=train['Category'].unique())

# cv
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# bayessearch cv
bayes_tuned_pipeline = BayesSearchCV(estimator=estimator_pipeline,
                                     search_spaces=search_space,
                                     n_iter=10,
                                     scoring=metric,
                                     cv=kfold_cv,
                                     verbose=12,
                                     n_jobs=-1,
                                     refit=True)

bayes_tuned_pipeline.fit(X_train, y_train)

# Saving model using pickle
pickle.dump(bayes_tuned_pipeline, open('logistic_tuned_pipeline.pkl', 'wb'))

phat_val = bayes_tuned_pipeline.predict_proba(X_val)
log_loss(y_val, phat_val)

make_submission_file(bayes_tuned_pipeline, test, 'onsite_logistic.csv')
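
make_submission_file is a local helper that is not shown. Given that predict_proba is used for scoring above, it presumably writes one probability column per class in the usual Kaggle submission layout; a purely illustrative sketch (the column names and id scheme are assumptions):

import pandas as pd

def make_submission_file(model, test, filename):
    """Write per-class probabilities for the test set to a CSV submission."""
    probs = model.predict_proba(test)
    submission = pd.DataFrame(probs, columns=model.classes_)
    submission.insert(0, 'Id', range(len(submission)))
    submission.to_csv(filename, index=False)
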
###### CatBoost with Tuning
cb_param_grid = {'iterations': Integer(10, 1000),
                 'depth': Integer(1, 8),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'random_strength': Real(1e-9, 10, 'log-uniform'),
                 'bagging_temperature': Real(0.0, 1.0),
                 'border_count': Integer(1, 255),
                 'l2_leaf_reg': Integer(2, 30),
                 'scale_pos_weight': Real(0.01, 1.0, 'uniform')}
cb_bs = BayesSearchCV(cb, cb_param_grid, scoring = 'roc_auc', n_iter = 100, n_jobs = 1,
                      return_train_score = False, refit = True, optimizer_kwargs = {'base_estimator': 'GP'}, 
                      random_state = 123)

cb_bs.fit(x_train, y_train)

y_probs = cb_bs.predict_proba(x_test)
y_probs = y_probs[:, 1]
y_pred = cb_bs.predict(x_test)

print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_probs)) ### 0.903

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
plot_roc_curve(fpr, tpr)
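
plot_roc_curve here is called with precomputed fpr/tpr arrays, so it is not sklearn's estimator-based helper but presumably a small local utility; a minimal sketch:

import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """Plot an ROC curve from precomputed false/true positive rates."""
    plt.plot(fpr, tpr, label='ROC curve')
    plt.plot([0, 1], [0, 1], linestyle='--', label='chance level')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.show()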

# Find the best parameters
cb_bs.best_params_
# Use the parameters to re-run the model
cb_tuned = CatBoostClassifier(iterations = 1000, depth = 8,
                 learning_rate = 0.11574, random_strength = 1e-9,
                 bagging_temperature = 1.0,
Example No. 6
                        'min_samples_leaf': Integer(2, 6),
                        'bootstrap': Categorical([False]),
                    },
                    n_iter=300,
                    random_state=42,
                    cv=5,
                    n_jobs=6,
                    verbose=1)

# y_metales.values.reshape(-1,1)
# Fit the Bayesian search model
opt.fit(X_train, y_train)
opt.best_params_
opt.best_score_

opt_pred = opt.predict_proba(X_new)[:, 1]
Confusion_Matrix(y_new, opt_pred, pred_prob=True)

Confusion_Matrix(np.array(pd.concat([y_test, y_new], axis=0)),
                 opt.predict_proba(pd.concat([X_test, X_new], axis=0))[:, 1],
                 pred_prob=True)
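
Confusion_Matrix is another helper defined elsewhere. From the calls above it accepts positive-class probabilities when pred_prob is set, so it presumably thresholds them before building the matrix; a hypothetical sketch:

import numpy as np
from sklearn.metrics import confusion_matrix

def Confusion_Matrix(y_true, y_pred, pred_prob=False, threshold=0.5):
    """Print a confusion matrix; threshold probabilities first if pred_prob."""
    if pred_prob:
        y_pred = (np.asarray(y_pred) >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    print(cm)
    return cm
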
'''
Best parameters
'''

best_parameters = {
    'GridSearchCV': {
        'bootstrap': False,
        'criterion': 'gini',
        'max_depth': 10,
        'max_features': 'auto',