示例#1
0
def test_catboost():
    """Check that shap.TreeExplainer works with CatBoost models.

    Two things are exercised:

    1. A CatBoostRegressor trained on the Boston data (with ``RAD`` as a
       categorical feature): the SHAP values plus the expected value must
       reproduce the model predictions.
    2. A CatBoostClassifier trained on a small slice of the Amazon data:
       constructing a TreeExplainer for it must not raise.

    The test is skipped (a notice is printed and the function returns) when
    CatBoost cannot be imported.
    """
    try:
        import catboost
        from catboost.datasets import amazon
    except ImportError:
        # Only an unavailable CatBoost should skip the test; a bare ``except``
        # would also hide KeyboardInterrupt and unrelated errors.
        print("Skipping test_catboost!")
        return
    import shap
    import numpy as np

    # train catboost model
    X, y = shap.datasets.boston()
    # RAD is handed to CatBoost as a categorical feature below, so cast it to
    # integers. The builtin ``int`` is used because the ``np.int`` alias no
    # longer exists in current NumPy releases.
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300,
                                       learning_rate=0.1,
                                       random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)

    predicted = model.predict(X)

    # per-row SHAP values + expected value should equal the prediction
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-4, \
        "SHAP values don't sum to model output!"

    # classifier case: first `ix` rows for training, the next 20 for validation
    train_df, _ = amazon()
    ix = 100
    X_train = train_df.drop('ACTION', axis=1)[:ix]
    y_train = train_df.ACTION[:ix]
    X_val = train_df.drop('ACTION', axis=1)[ix:ix + 20]
    y_val = train_df.ACTION[ix:ix + 20]
    model = catboost.CatBoostClassifier(iterations=100,
                                        learning_rate=0.5,
                                        random_seed=12)
    model.fit(X_train,
              y_train,
              eval_set=(X_val, y_val),
              verbose=False,
              plot=False)
    # only checks that the explainer can be built for a classifier
    shap.TreeExplainer(model)
示例#2
0
# In[21]:

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os

# # Exploratory Data Analysis

# In[22]:

# Load the Amazon dataset through CatBoost's dataset helpers
from catboost.datasets import amazon

train, test = amazon()

# In[23]:

# Print the shape of both splits
print("Train shape: {}, Test shape: {}".format(
    *(split.shape for split in (train, test))))

# In[24]:

# Preview the first five rows of the train split
train.head(n=5)

# In[25]:

# Preview the first five rows of the test split
test.head(n=5)

# The dataset has 9 columns, plus the target (`ACTION`) in train and `id` in test.
# All of these columns are categorical values encoded as integers.
示例#3
0
import catboost
from catboost import CatBoostClassifier
from catboost import Pool
from catboost.utils import create_cd
from sklearn.model_selection import train_test_split

#######################
#   DATA EXPLORATION
#######################

# Seed constant (not consumed within this part of the script)
random_seed = 12345

# Load the Amazon dataset as a (train, test) pair
from catboost.datasets import amazon
train_df, test_df = amazon()

# Separate the features from the 'ACTION' label column
X = train_df.drop(columns='ACTION')
y = train_df['ACTION']

# Treat every feature column as categorical: list all column indices
cat_features = list(range(len(X.columns)))
print(cat_features)

#######################
#   BASIC MODEL
#######################

# Basic classifier configured with 200 iterations (trees)
model = CatBoostClassifier(iterations=200)
示例#4
0
@author: gandhi
"""
import itertools
import numpy as np
import pandas as pd
import os
from catboost.datasets import amazon
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, log_loss
## Extracting the data
train_data, test_data = amazon()
train_data.head()
df = train_data

## Analysing the data
# Histograms of the columns at positions 1-9 (column 0 is skipped), drawn on
# a 5x2 subplot grid. Exactly one figure is opened for them: an additional
# bare plt.figure() call would only leave a stray empty figure behind.
plt.figure(figsize=(30, 20))
for position, column in enumerate(df.columns[1:10], start=1):
    plt.subplot(5, 2, position)
    plt.hist(df[column])
    plt.xlabel(column)
    plt.ylabel("Frequency")
plt.show()

# Annotated heatmap of the pairwise correlations between columns
plt.figure(figsize=(30, 20))
sns.heatmap(df.corr(), annot=True, cmap='viridis', linewidth=1)
# New figure, presumably for the plot that follows this excerpt
plt.figure(figsize=(10, 30))
示例#5
0
def _encode_frame(frame, feature_col, encoders, label_col, label_encoder):
    """Return a copy of *frame* with the fitted encoders applied.

    Feature columns that have an entry in *encoders* are transformed with
    that encoder; the label column is transformed with *label_encoder*.
    The input frame itself is left unmodified.
    """
    encoded = frame.copy()
    for col in feature_col:
        if col in encoders:
            encoded[col] = encoders[col].transform(
                encoded[col].to_numpy().reshape(-1, 1))
    encoded[label_col] = label_encoder.transform(encoded[label_col])
    return encoded


def main():
    """Prepare the Amazon dataset and save it in NumPy format.

    Steps:
      * load the train/test split from ``raw/train.csv`` and ``raw/test.csv``
        if both exist; otherwise download the data with ``amazon()``, split
        it at row 26215 and cache the two CSV files under ``raw/``;
      * fill NaNs (``<UNK>`` for object columns, the integer-truncated column
        mean otherwise);
      * ordinal-encode object feature columns and label-encode ``ACTION``;
      * write ``train.npy``, ``test.npy`` and ``feature.npy`` to the current
        directory.
    """
    # fill NaN's with <UNK> token
    unk_token = '<UNK>'

    # retrieve dataset
    start = time.time()
    # Check for the cached files themselves: an existing but empty 'raw'
    # directory must not be mistaken for a valid cache.
    if os.path.exists('raw/train.csv') and os.path.exists('raw/test.csv'):
        train = pd.read_csv('raw/train.csv')
        test = pd.read_csv('raw/test.csv')
    else:
        data, _ = amazon()  # train is the only one with labels
        os.makedirs('raw', exist_ok=True)
        # Explicit copies: the columns are reassigned below, and writing into
        # a slice of `data` is ambiguous (view vs. copy) in pandas.
        train = data[:26215].copy()  # same split they use in Scharchilev et al.
        test = data[26215:].copy()
        train.to_csv('raw/train.csv', index=False)
        test.to_csv('raw/test.csv', index=False)
    print('time to load amazon: {}'.format(time.time() - start))

    # define columns
    label_col = 'ACTION'
    feature_col = list(train.columns)
    feature_col.remove(label_col)

    # nan rows
    train_nan_rows = train[train.isnull().any(axis=1)]
    test_nan_rows = test[test.isnull().any(axis=1)]
    print('train nan rows: {}'.format(len(train_nan_rows)))
    print('test nan rows: {}'.format(len(test_nan_rows)))

    # fit encoders and fill in NaNs with unknown token or mean value
    encoders = {}
    for col in feature_col:
        if str(train[col].dtype) == 'object':
            train[col] = train[col].fillna(unk_token)
            test[col] = test[col].fillna(unk_token)
            # encoder is fitted on the training column only
            encoders[col] = OrdinalEncoder().fit(train[col].to_numpy().reshape(
                -1, 1))
        else:
            # NOTE(review): test NaNs are filled with the test split's own
            # mean rather than the training mean - confirm this is intended.
            train[col] = train[col].fillna(int(train[col].mean()))
            test[col] = test[col].fillna(int(test[col].mean()))
    label_encoder = LabelEncoder().fit(train[label_col])

    # transform train dataframe
    new_train = _encode_frame(train, feature_col, encoders, label_col,
                              label_encoder)

    # transform test dataframe
    new_test = _encode_frame(test, feature_col, encoders, label_col,
                             label_encoder)

    # show difference
    print('train')
    print(train.head(5))
    print(new_train.head(5))

    print('test')
    print(test.head(5))
    print(new_test.head(5))

    # save to numpy format
    print('saving to train.npy...')
    np.save('train.npy', new_train.to_numpy())
    print('saving to test.npy...')
    np.save('test.npy', new_test.to_numpy())
    np.save('feature.npy', feature_col)