Example #1
def custom_metric():
    from catboost import CatBoostClassifier, Pool

    train_data = [[0, 3],
                  [4, 1],
                  [8, 1],
                  [9, 1]]

    train_labels = [0, 0, 1, 1]

    eval_data = [[2, 1],
                 [3, 1],
                 [9, 0],
                 [5, 3]]

    eval_labels = [0, 1, 1, 0]

    eval_dataset = Pool(eval_data,
                        eval_labels)

    model = CatBoostClassifier(learning_rate=0.03,
                               custom_metric=['Logloss',
                                              'AUC:hints=skip_train~false'])

    model.fit(train_data,
              train_labels,
              eval_set=eval_dataset,
              verbose=False)

    # get_best_score() returns the best value reached for each metric, e.g.
    # {'learn': {...}, 'validation': {'Logloss': ..., 'AUC': ...}}
    print(model.get_best_score())
    # Hyperparameter-search objective: param_adjust_dtypes, prior_params,
    # pds_dtypes, train_set and val_set are assumed to be defined elsewhere.
    def catboost_hyperparams(**dict_):
        params = param_adjust_dtypes(prior_params, pds_dtypes, dict_)
        # Model definition
        model = CatBoostClassifier(**params)
        # Fitting
        model.fit(train_set, eval_set=val_set, use_best_model=True)

        return np.max(model.get_best_score()["validation"]["AUC"])
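    # A self-contained sketch (assumption) of the same pattern with the
    # bayes_opt library: BayesianOptimization passes every hyperparameter as a
    # float, so integer parameters are rounded before fitting, and the best
    # validation AUC is returned for maximisation. The names below
    # (bayes_opt_objective, sketch_model) are illustrative only.
    from bayes_opt import BayesianOptimization

    def bayes_opt_objective(depth, learning_rate):
        sketch_model = CatBoostClassifier(iterations=50,
                                          depth=int(round(depth)),
                                          learning_rate=learning_rate,
                                          eval_metric='AUC',
                                          verbose=False)
        sketch_model.fit(train_data, train_labels,
                         eval_set=eval_dataset, use_best_model=True)
        return sketch_model.get_best_score()['validation']['AUC']

    optimizer = BayesianOptimization(f=bayes_opt_objective,
                                     pbounds={'depth': (3, 8),
                                              'learning_rate': (0.01, 0.3)},
                                     random_state=0)
    optimizer.maximize(init_points=2, n_iter=3)
    print(optimizer.max)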
# Assumed imports for this snippet
import time
import numpy as np
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from hyperopt import STATUS_OK


def hyperopt_ctb_scoreCV_manual(params):
    global X_hyper
    global y_hyper
    global global_best_model

    for key in catboost_space:
        print(f'       {key} {params[key]}')

    skf = StratifiedKFold(n_splits=3)

    cross_val_result = {'estimator': [], 'test_score': []}
    for i, (train_ind, val_ind) in enumerate(skf.split(X_hyper, y_hyper)):
        # Re-create the classifier each fold so every stored estimator is a
        # distinct fitted model
        clf = CatBoostClassifier(**params)
        # StratifiedKFold yields positional indices, hence iloc
        train_set = Pool(data=X_hyper.iloc[train_ind],
                         label=y_hyper[train_ind],
                         cat_features=list(X_hyper.columns))
        val_set = Pool(data=X_hyper.iloc[val_ind],
                       label=y_hyper[val_ind],
                       cat_features=list(X_hyper.columns))

        clf.fit(X=train_set, eval_set=val_set, use_best_model=True)
        cross_val_result['estimator'].append(clf)
        cross_val_result['test_score'].append(
            clf.get_best_score()['validation']['AUC'])

    # current_score = clf.get_best_score()['validation']['AUC']
    current_score = np.mean(cross_val_result['test_score'])

    if current_score > global_best_model['AUC']:
        global_best_model['AUC'] = current_score
        global_best_model['model'] = cross_val_result['estimator']
        print(f'new best AUC = {current_score}')

    result = {
        'loss': -current_score,
        'status': STATUS_OK,

        # -- store other results like this
        'eval_time': time.time(),
        'other_stuff': {
            'type': None,
            'value': [0, 1, 2]
        },
        # 'model': clf,
        # -- attachments are handled differently
        'attachments': {
            'attachments': 'attachments'
        }
    }

    return result
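# A minimal, self-contained sketch (assumption) of how this objective can be
# driven with hyperopt's fmin; catboost_space, X_hyper, y_hyper and
# global_best_model are the globals the function expects, filled here with toy
# data purely for illustration.
import pandas as pd
from hyperopt import fmin, tpe, hp, Trials

X_hyper = pd.DataFrame({'f1': list('ababab' * 10),
                        'f2': list('xyxyxy' * 10)})
y_hyper = np.array([0, 1] * 30)
global_best_model = {'AUC': 0.0, 'model': None}
catboost_space = {
    'depth': hp.choice('depth', [4, 6]),
    'iterations': hp.choice('iterations', [50, 100]),
    'eval_metric': hp.choice('eval_metric', ['AUC']),
    'verbose': hp.choice('verbose', [False]),
}

best = fmin(fn=hyperopt_ctb_scoreCV_manual,
            space=catboost_space,
            algo=tpe.suggest,
            max_evals=5,
            trials=Trials())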
Example #4
    def fit_catboost(self, X, y, X_val, y_val):
        logging.info('- Fit catboost model')
        model = CatBoostClassifier(iterations=10000, eval_metric='AUC')
        model.fit(X,
                  y,
                  eval_set=(X_val, y_val),
                  early_stopping_rounds=50,
                  verbose=100)

        best_score = model.get_best_score()['validation']['AUC']
        best_iteration = model.get_best_iteration()
        self.models.append({
            'model': model,
            'best_score': best_score,
            'best_iteration': best_iteration
        })
        logging.info('Best score = {:.2%}, in {} iterations'.format(
            best_score, best_iteration))
Example #5
def fun_catboost(X, y, X_train, X_validation, y_train, y_validation, target):

    #Creating a training set for modeling and validation set to check model performance
    #X = df_train.drop(['Segmentation', 'Gender','Ever_Married', 'Work_Experience','Family_Size','Var_1'], axis=1)

    #categorical_features_indices = np.where(df_train.dtypes != np.float)[0]
    # Treat every column in X_train as categorical
    categorical_features_indices = list(range(len(X_train.columns)))

    #importing library and building model
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(iterations=5,
                               depth=3,
                               learning_rate=0.1,
                               loss_function='MultiClass',
                               eval_metric='Accuracy')
    model.fit(X_train,
              y_train,
              eval_set=(X_validation, y_validation),
              plot=True)

    # df_test and pd are assumed to be available in the enclosing scope
    predictions = model.predict(df_test)

    model.get_feature_importance(type="FeatureImportance")
    from catboost import Pool, CatBoostClassifier
    from catboost.utils import get_confusion_matrix

    train_label = ["A", "B", "C", "D"]
    cm = get_confusion_matrix(model, Pool(X_validation, y_validation))
    print(cm)
    print(model.get_best_score())

    submission = pd.DataFrame()
    submission['ID'] = df_test['ID']
    submission[target] = predictions
    return categorical_features_indices, model, submission, predictions
# Assumed imports for this snippet; X_train, X_test and y_test are assumed to
# be prepared earlier
import pandas as pd
from catboost import CatBoostClassifier, Pool

eval_pool = Pool(X_test, y_test)


# load model
model = CatBoostClassifier()
model.load_model('models/catboost_model_4.dump')


# Feature Importance: know which feature contributed the most
# (train_pool is assumed to have been built earlier, analogously to eval_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = pd.DataFrame(X_train).columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

print('\n\n\n')
print(model.get_best_score())
print(model.get_params())


# Validation prediction: predict_proba returns class probabilities
# (model.predict would return class labels instead)
probabilities = model.predict_proba(eval_pool)
# print(probabilities)
pd.DataFrame(probabilities).to_csv('validation-scores/val-scores-3.csv')


# TEST VALUES

# preped_test_values = np.array(pd.read_csv('preped/preped_test_&_featured.csv'))

# eval_dataset = Pool(test_values)
# test_prediction = model.predict(preped_test_values)
# Assumed imports for this snippet (MusicDataset and the _*_key constants are
# project-specific)
import numpy as np
import torch
from catboost import CatBoostClassifier, Pool, cv


def search_CatBoost_parameters(config: dict,
                               train_dataset: MusicDataset,
                               val_dataset: MusicDataset = None,
                               internal_cv=False):
    """
    Fit a CatBoostClassifier using train and validation set
    Returns:
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results
    """
    # Get parameters
    if isinstance(config[_n_iterations_key], list):
        iterations = np.arange(config[_n_iterations_key][0],
                               config[_n_iterations_key][1],
                               config[_n_iterations_key][2])
    else:
        iterations = config[_n_iterations_key]

    if isinstance(config[_learning_rate_key], list):
        learning_rates = np.arange(config[_learning_rate_key][0],
                                   config[_learning_rate_key][1],
                                   config[_learning_rate_key][2])
    else:
        learning_rates = config[_learning_rate_key]

    loss_function = config.get("loss_function", "CrossEntropy")
    parameter_names = []
    parameter_sets = []
    results = []

    # Get data
    _, X_train, y_train = train_dataset.get_whole_dataset_as_pd()
    if val_dataset is not None:
        _, X_val, y_val = val_dataset.get_whole_dataset_as_pd()

    # GPU
    if (torch.cuda.is_available()):
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None

    if (not internal_cv):
        # No internal cross validation during training
        for i_it, it in enumerate(iterations):
            for i_lr, lr in enumerate(learning_rates):
                model = CatBoostClassifier(iterations=it,
                                           learning_rate=lr,
                                           loss_function=loss_function,
                                           task_type=task_type,
                                           devices=devices,
                                           custom_metric=['Accuracy'])
                model.fit(X_train,
                          y_train,
                          eval_set=(X_val, y_val),
                          verbose=10)
                params = model.get_params()
                parameter_names = list(params.keys())
                parameter_sets.append(list(params.values()))
                best_score = model.get_best_score()
                results.append(best_score['validation']['Accuracy'])
                best_iter = model.get_best_iteration()
                print("Best iteration: " + str(best_iter))
    else:
        # Use catboost cross validation procedure
        params = {}
        params['loss_function'] = loss_function
        params['iterations'] = iterations
        params['custom_metric'] = 'Accuracy'
        params['task_type'] = task_type
        params['devices'] = devices

        best_value = 0.0
        best_iter = 0
        for i_lr, lr in enumerate(learning_rates):
            params['learning_rate'] = lr
            cv_data = cv(params=params,
                         pool=Pool(X_train, label=y_train),
                         fold_count=5,
                         shuffle=True,
                         partition_random_seed=0,
                         plot=True,
                         stratified=False,
                         verbose=50)
            res_value = np.max(cv_data['test-Accuracy-mean'])
            res_iter = np.argmax(cv_data['test-Accuracy-mean'])
            params['best_iteration'] = res_iter

            print(
                f"Best iteration for lr {lr}: {res_iter} with val accuracy {res_value}"
            )

            results.append(res_value)
            parameter_sets.append(list(params.values()))
            parameter_names = list(params.keys())

            # Remove entry from dict since it is used as input for cv again
            params.pop('best_iteration')

    return parameter_names, parameter_sets, results
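# A hedged usage sketch (assumption): the config dict maps the module-level
# _n_iterations_key / _learning_rate_key constants either to a scalar or to a
# [start, stop, step] list that is expanded with np.arange, e.g.
#
#     config = {_n_iterations_key: [100, 501, 100],
#               _learning_rate_key: [0.03, 0.31, 0.09],
#               'loss_function': 'MultiClass'}
#     names, sets, results = search_CatBoost_parameters(config,
#                                                        train_dataset,
#                                                        val_dataset)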
Example #8
# model1, X_train/y_train, X_val/y_val and categorical_features_step1 are
# assumed to be defined in earlier notebook cells
train_pool = Pool(X_train,
                  y_train.astype(int),
                  cat_features=categorical_features_step1)
validate_pool = Pool(X_val,
                     y_val.astype(int),
                     cat_features=categorical_features_step1)

model1.fit(
    train_pool,
    eval_set=validate_pool,
    #     logging_level='Verbose',  # you can uncomment this for text output
    plot=True)
# -

model1.get_best_score()

# +
# Plot non-normalized confusion matrix
np.set_printoptions(precision=2)
predict_val = model1.predict(X_val)

plot_confusion_matrix(y_val,
                      predict_val,
                      classes=np.array([True, False]),
                      title='Confusion matrix',
                      normalize=False)
# -

# # Beta calibration
#
Example #9
# folds, train, test, features, cat_params, categorical_features,
# cat_oof_train, cat_oof_test and feature_importance_df are assumed to be
# prepared earlier in the notebook
splits = folds.split(train, train['label'])
score_csv = []
for i, (tr_idx, val_idx) in enumerate(splits):
    X_tr, X_vl = train[features].iloc[tr_idx], train[features].iloc[val_idx]
    y_tr, y_vl = train['label'].iloc[tr_idx], train['label'].iloc[val_idx]
    clf = CatBoostClassifier(**cat_params)
    clf.fit(X_tr, y_tr,
            cat_features=categorical_features,
            eval_set=(X_vl, y_vl),
            verbose=100, plot=True)

    y_pred_valid = clf.predict_proba(X_vl)[:, 1]
    cat_oof_train[val_idx] = y_pred_valid.reshape(-1, 1)
    cat_oof_test += clf.predict_proba(test[features])[:, 1].reshape(-1, 1) / folds.n_splits


    # 'F1' must be among the evaluated metrics (e.g. eval_metric or
    # custom_metric in cat_params) for get_best_score() to report it
    score_csv.append(clf.get_best_score()['validation']['F1'])

    del X_tr, X_vl, y_tr, y_vl
    # Features imp
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.get_feature_importance()
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    gc.collect()

score = np.mean(score_csv)
Example #10

model = CatBoostClassifier(**model_params)

# model = CatBoostClassifier(**model_params, verbose=False)

model.fit(train_data, eval_set=test_data, verbose=False, plot=False)

model.save_model(f"{project_dir}/model/model.cbm")

builtin_metrics = model.eval_metrics(train_data,
                                     metrics=['Logloss', 'AUC', 'F1', 'PRAUC'])

# write results

hold_out_score = model.get_best_score()

# write_eval_summary_file(cv_scores, hold_out_score)

predict_probas = model.predict_proba(test_data)

test_data_metrics = calculate_metrics(build_spec['standard_metrics'],
                                      build_spec['custom_metrics'], y_test,
                                      predict_probas[:, 1])

write_eval_summary_file(cv_scores, test_data_metrics)

predict_probas = pd.DataFrame(predict_probas, columns=['False', 'True'])
predict_probas.to_csv(f"{project_dir}/model/test_data_prediction.csv")

# type='ShapValues' is an assumption here, chosen to match the variable name
shap_values = model.get_feature_importance(data=test_data,
                                            type='ShapValues')
# Assumed imports for this snippet (generate_folds and save_file are
# user-defined helpers)
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from catboost import CatBoostClassifier, Pool
from IPython.display import display


def gridsearch_early_stopping(cv,
                              X,
                              y,
                              folds,
                              grid,
                              cat_features=None,
                              save=None):
    '''
    Perform a grid search with early stopping across the folds specified by index.

    Parameters
    ----------
    cv: cross-validation splitter passed to generate_folds
    X: DataFrame or Numpy array
    y: DataFrame or Numpy array
    folds: list of fold indexes to evaluate
    grid: parameter grid
    cat_features: list of categorical feature names or indices (default=None)
    save:   string, excluding file extension (default=None)
            if given, saves results_df for each fold via save_file to '../../models'
    '''

    if np.unique(y).size <= 2:
        loss_function = 'Logloss'
    else:
        loss_function = 'MultiClass'

    # generate data folds
    train_X, train_y, test_X, test_y = generate_folds(cv, X, y)

    # iterate through specified folds
    for fold in folds:
        # assign train and test pools
        test_pool = Pool(data=test_X[fold],
                         label=test_y[fold],
                         cat_features=cat_features)
        train_pool = Pool(data=train_X[fold],
                          label=train_y[fold],
                          cat_features=cat_features)

        # creating results_df dataframe
        results_df = pd.DataFrame(columns=[
            'params' + str(fold), loss_function + str(fold), 'Accuracy' +
            str(fold), 'iteration' + str(fold)
        ])

        best_score = 99999

        # iterate through parameter grid
        for params in ParameterGrid(grid):

            # create catboost classifer with parameter params
            model = CatBoostClassifier(
                cat_features=cat_features,
                early_stopping_rounds=50,
                task_type='GPU',
                custom_loss=['Accuracy'],
                iterations=3000,
                #class_weights=weights,
                **params)

            # fit model
            model.fit(train_pool, eval_set=test_pool, verbose=400)

            # append results to results_df

            print(model.get_best_score()['validation'])
            new_row = pd.DataFrame([[
                params,
                model.get_best_score()['validation'][loss_function],
                model.get_best_score()['validation']['Accuracy'],
                model.get_best_iteration()
            ]],
                                   columns=[
                                       'params' + str(fold),
                                       loss_function + str(fold),
                                       'Accuracy' + str(fold),
                                       'iteration' + str(fold)
                                   ])
            # DataFrame.append was removed in pandas 2.0, so use pd.concat
            results_df = pd.concat([results_df, new_row], ignore_index=True)

            # save best score and parameters
            fold_loss = model.get_best_score()['validation'][loss_function]
            if fold_loss < best_score:
                best_score = fold_loss
                best_grid = params

        print("Best logloss: ", best_score)
        print("Grid:", best_grid)

        if save is not None:
            save_file(results_df,
                      save + str(fold) + '.joblib',
                      dirName='../../models')
        display(results_df)
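# A hedged usage sketch (assumption): X, y and categorical_columns stand in
# for the caller's data, and generate_folds/save_file are the user-defined
# helpers this function relies on, so this is illustration rather than a
# runnable script.
#
#     from sklearn.model_selection import StratifiedKFold
#
#     grid = {'learning_rate': [0.03, 0.1], 'depth': [4, 6]}
#     gridsearch_early_stopping(StratifiedKFold(n_splits=5, shuffle=True,
#                                               random_state=0),
#                               X, y,
#                               folds=[0, 1, 2],
#                               grid=grid,
#                               cat_features=categorical_columns,
#                               save='catboost_gridsearch')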