Example #1
def test_one_doc_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(
        FIMP_PATH,
        np.array(
            model.get_feature_importance(
                np.ones(pool.num_col(), dtype=int),
                0,
                cat_features=pool.get_cat_feature_indices(),
                fstr_type='Doc')))
    return local_canonical_file(FIMP_PATH)
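Note: `fstr_type` is the legacy keyword argument; recent CatBoost releases spell it `type`. A minimal self-contained sketch on synthetic data (everything below is illustrative, not taken from the test suite above):

import numpy as np
from catboost import CatBoostClassifier, Pool

# Illustrative synthetic binary-classification data.
rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)

pool = Pool(X, y)
model = CatBoostClassifier(iterations=5, random_seed=0, verbose=False)
model.fit(pool)

# 'FeatureImportance' (the default) yields one global score per feature.
print(model.get_feature_importance(pool, type='FeatureImportance'))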
Example #2
def fun_catboost(X, y, X_train, X_validation, y_train, y_validation, target):

    #Creating a training set for modeling and validation set to check model performance
    #X = df_train.drop(['Segmentation', 'Gender','Ever_Married', 'Work_Experience','Family_Size','Var_1'], axis=1)

    #categorical_features_indices = np.where(df_train.dtypes != np.float)[0]
    categorical_features_indices = list(range(len(X_train.columns)))

    #importing library and building model
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(iterations=5,
                               depth=3,
                               learning_rate=0.1,
                               loss_function='MultiClass',
                               eval_metric='Accuracy')
    model.fit(X_train,
              y_train,
              eval_set=(X_validation, y_validation),
              plot=True)

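    # NOTE: df_test is not a parameter of this function; it is assumed to be
    # defined in the enclosing scope (e.g., a notebook global).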
    predictions = model.predict(df_test)

    model.get_feature_importance(type="FeatureImportance")
    from catboost import Pool
    from catboost.utils import get_confusion_matrix

    train_label = ["A", "B", "C", "D"]
    cm = get_confusion_matrix(model, Pool(X_validation, y_validation))
    print(cm)
    print(model.get_best_score())

    submission = pd.DataFrame()
    submission['ID'] = df_test['ID']
    submission[target] = predictions
    return categorical_features_indices, model, submission, predictions
Example #3
class CatBoost(BaseModel):
    '''
    Wrapper class of CatBoost.
    self.core contains a CatBoostClassifier.
    '''
    @timer
    def __init__(self, config):
        self.config = config

    @timer
    def train(self,
              X_train,
              y_train,
              X_val=None,
              y_val=None,
              params=None,
              num_boost_round=100,
              early_stopping_rounds=None,
              fold=0):

        self.core = CatBoostClassifier(
            # **self.config.params,
            **params,
            num_boost_round=num_boost_round)
        self.core.fit(
            X=X_train,
            y=y_train,
            eval_set=(X_val, y_val),
            # verbose=True,
            early_stopping_rounds=early_stopping_rounds)
        return self

    @timer
    def predict(self, X_test):
        y_test = self.core.predict_proba(X_test)[:, 1]
        return y_test

    @property
    def feature_importance(self):
        return self.core.get_feature_importance()

    @property
    def best_iteration(self):
        return self.core.get_best_iteration()

    @property
    def evals_result(self):
        return self.core.get_evals_result()
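A minimal usage sketch for this wrapper (assuming the `BaseModel` and `timer` helpers from the surrounding project; the data and parameter values are illustrative):

import numpy as np

rng = np.random.RandomState(0)
X_tr, y_tr = rng.rand(200, 5), rng.randint(0, 2, 200)
X_va, y_va = rng.rand(50, 5), rng.randint(0, 2, 50)

# `config` is stored but not used by train() itself.
wrapper = CatBoost(config=None).train(
    X_tr, y_tr, X_va, y_va,
    params={'learning_rate': 0.1, 'verbose': False},
    num_boost_round=50,
    early_stopping_rounds=10)
print(wrapper.best_iteration)
print(wrapper.feature_importance)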
Example #4
class CatBoost:
    _verbose = 200
    _train_dir = DATA_CACHE_DIR
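    # get_gpu_device_count() (from catboost.utils) reports how many GPUs are
    # visible; when none are found, task_type/devices stay None and training
    # falls back to CPU.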
    _is_gpu_available = get_gpu_device_count()
    _task_type = "GPU" if _is_gpu_available > 0 else None
    _devices = "0" if _is_gpu_available > 0 else None  # `devices` expects GPU indices such as "0", not "GPU"

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        self.model = CatBoostClassifier(loss_function="MultiClass",
                                        task_type=self._task_type,
                                        devices=self._devices,
                                        train_dir=self._train_dir,
                                        random_seed=SEED)
        self.model.set_params(**aux_params)
        self.model_id = model_id

        path = f"{model_save_path}/{model_id}"
        os.makedirs(path, exist_ok=True)
        self.model_path = path
        self.modelfile_save_path = os.path.join(path, STANDARD_MODEL_NAME)

    def load(self):
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        self.model.fit(Pool(X_train, y_train),
                       eval_set=(X_valid, y_valid),
                       use_best_model=True,
                       verbose=self._verbose)
        self.save()

    def predict(self, X, load=False):
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        importances = self.model.get_feature_importance(
            data=Pool(X_train, y_train))
        plot_importance(importances, features, self.model_path, self.model_id)
Example #5
    def train_all_save_catboost(self, X, y, categorical_features_indices):
        """train whole data and save the training to be use later in new predictions"""
        model = CatBoostClassifier(loss_function='MultiClass',
                                   eval_metric='TotalF1',
                                   random_seed=42,
                                   leaf_estimation_method='Newton')
        cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                     model.get_params())
        print("precise validation accuracy score:{}".format(np.max(cv_data)))
        model.fit(X, y, cat_features=categorical_features_indices)

        #feature importance
        print(model.get_feature_importance(prettified=True))
        # train = Pool(X, y, cat_features=categorical_features_indices)
        # feature_importances = model.get_feature_importance(train)
        # feature_names = X.columns
        # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        #     print('{}: {}'.format(name, score))

        model.save_model('catboost_model.dump')
        print("Catboost model has been saved!")
Example #6
class ModelCatboost(Model):
    # class for model Catboost

    def __init__(self, **params):
        super().__init__(**params)
        self.early_stopping = True
        self.feature_importance = []
        self.set_model()

    def set_model(self):
        # choose regressor or classifier depending on the problem type
        if self.problem_type == 'regression':
            self.model = CatBoostRegressor(**self.model_params)
        else:
            self.model = CatBoostClassifier(**self.model_params)

    def fit(self, X_train, y_train):
        # train with num_rounds
        train_pool = Pool(X_train, label=y_train.astype(float))
        self.set_model()
        self.model.fit(train_pool, use_best_model=False)
        self.feature_importances_ = self.model.get_feature_importance(
            train_pool)

    def fit_early_stopping(self, X_train, y_train, X_eval, y_eval):
        # specific early stopping for Catboost
        train_pool = Pool(X_train, label=y_train.astype(float))
        eval_pool = Pool(X_eval, label=y_eval.astype(float))
        # set specific parameters for early stopping (overfitting detector with iter)
        self.params['iterations'] = MAX_ROUNDS
        self.params['od_type'] = 'iter'
        self.params['od_wait'] = PATIENCE

        self.model.fit(train_pool, eval_set=eval_pool, use_best_model=True)

        self.num_rounds = self.model.tree_count_

        self.params['iterations'] = self.num_rounds
        self.params.pop('od_type')
        self.params.pop('od_wait')
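For reference, the `od_type='iter'` / `od_wait` pair set above has a shorthand: passing `early_stopping_rounds` to `fit` configures the same 'Iter' overfitting detector. A sketch under the same assumptions (MAX_ROUNDS, PATIENCE, and the pools come from the surrounding code):

# Equivalent to od_type='iter', od_wait=PATIENCE above.
model = CatBoostClassifier(iterations=MAX_ROUNDS)
model.fit(train_pool, eval_set=eval_pool,
          early_stopping_rounds=PATIENCE, use_best_model=True)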
Example #7
def multi_classification():
    from catboost import CatBoostClassifier, Pool
    
    categorical_features_indices = [0,1]

    clf = CatBoostClassifier(
        loss_function='MultiClass',
        iterations=1000,
        random_seed=42,
        logging_level='Silent'     
    )

    clf.fit(
        X_train, y_train,
        cat_features=categorical_features_indices,
        #eval_set=(X_validation, y_validation),
        #logging_level='Verbose',  
        #plot=True
    )
    # Prediction
    y_pred = clf.predict(X_test)
    
    from sklearn.metrics import classification_report
    print('\n classification report:\n', classification_report(y_test, y_pred))
    
    from sklearn.metrics import f1_score
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print('f1-score (micro): ', f1_micro)
    print('f1-score (macro): ', f1_macro)
    
    # Feature Importance
    # get_feature_importance takes a Pool, not separate X/y/cat_features arguments.
    feature_importances = clf.get_feature_importance(
        Pool(X_train, y_train, cat_features=categorical_features_indices))
    feature_names = X_train.columns
    print('\n Feature Importance: ')
    for score, name in zip(feature_importances, feature_names):
        print('{}: {}'.format(name, score))
Example #8
    def cb_model(self, category_cols=None):
        if category_cols is None:
            category_cols = []
        category_id = []
        for index, value in enumerate(self.X_t.columns):
            if value in category_cols:
                category_id.append(index)
        model = CatBoostClassifier(iterations=self.rounds,
                                   learning_rate=0.1,
                                   cat_features=category_id,
                                   loss_function='Logloss',
                                   logging_level='Verbose',
                                   eval_metric='AUC')
        model.fit(self.X_t,
                  self.y_t,
                  eval_set=(self.X_v, self.y_v),
                  early_stopping_rounds=self.early_stop)
        # res = model.predict_proba(self.test)[:, 1]
        importance = model.get_feature_importance(prettified=True)  # show feature importances
        print(importance)
        if self.modelname is not None:
            model.save_model(self.modelname + '_cb.model')
        return model
Example #9
def get_important_features(train_x, train_y):
    tr_x, val_x, tr_y, val_y = train_test_split(train_x,
                                                train_y,
                                                random_state=1)
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        use_best_model=True,
        eval_metric="Accuracy",
    )
    model.fit(
        tr_x,
        tr_y,
        eval_set=(val_x, val_y),
        plot=True,
    )
    importance = pd.DataFrame(
        model.get_feature_importance(),
        index=train_x.columns,
        columns=["importance"]).sort_values("importance", ascending=False)
    print(importance)
    return importance
Example #10
def catboost_train_eval(x_train, y_train, x_vali, y_vali, x_test, params,
                        cate_feat_idx):
    print('=============================================')
    print('catboost model training...')
    train_pool = Pool(x_train, y_train, cat_features=cate_feat_idx)
    vali_pool = Pool(x_vali, y_vali, cat_features=cate_feat_idx)
    model = CatBoostClassifier(
        **params, task_type='CPU')  # sometimes GPU is slower than CPU
    model.fit(train_pool, eval_set=vali_pool)

    print('=============================================')
    print('catboost vali acc: {:06.4f}'.format(
        accuracy_score(y_vali, model.predict(x_vali))))

    print('=============================================')
    print('catboost model training parameters:')
    for k, v in params.items():
        print('{:15}: {}'.format(k, v))

    print('=============================================')
    print('catboost model predicting...')
    test_pred_result = model.predict(x_test)
    test_pred_prob = model.predict_proba(x_test)
    print(test_pred_result[:10])
    print(test_pred_prob[:10])

    print('=============================================')
    print('catboost feature importances evaluate...')
    feat_importances = model.get_feature_importance(train_pool)
    feat_names = x_train.columns
    feat_importances_df = pd.DataFrame()
    feat_importances_df['feat'] = feat_names
    feat_importances_df['score'] = feat_importances
    feat_importances_df.sort_values(['score'], ascending=False, inplace=True)
    feat_importances_df = feat_importances_df.reset_index(drop=True)
    print(feat_importances_df)
Example #11
    "Breed1",
    "Breed2",
    "Breed3",
    "Breed4",
    "Breed5",
    "Breed6",
    "Breed7",
    "Breed8",
    "Breed9",
    "Breed10",
    "Color-light",
    "Color-medium",
    "Color-dark",
    "Color-warm",
    "Color-medium",
    "Color-cold",
    "Color_feature1",
    "Color_feature2",
]

print(model.score(x_test, y_test))
plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
importances = model.get_feature_importance(prettified=False)
plt.bar(range(len(importances)), importances)
plt.title("Cat Feature Importance")
plt.xticks(range(len(importances)), features, rotation='vertical')
plt.gcf().savefig('feature_importance_catboost.png')
plt.show()
Example #12
class Classifier(object):
    attr_key_list = [
        "AGE", "AVE_ASSISTANCE", "AVE_FOUL", "AVE_SCORE", "AVE_STEALING",
        "AVE_TACKLING", "HEIGHT", "NATION", "SAVING_TIME", "SPEED", "WEIGHT",
        "YELLOW_RED_CARD_NUMBER"
    ]

    def __init__(self,
                 dataset_file_path,
                 test_dataset_file_path,
                 dataset_split_ratio=0.7,
                 train_iter=10,
                 depth=10,
                 learning_rate=0.1,
                 loss='MultiClass',
                 logging_level='Verbose'):
        self.dataset = load_json(file_path=dataset_file_path)
        dataset_num = len(self.dataset)

        self.train_set = self.dataset[:int(dataset_num * dataset_split_ratio)]
        self.validate_set = self.dataset[int(dataset_num *
                                             dataset_split_ratio):]

        self.test_set = load_json(file_path=test_dataset_file_path)
        self.train_attr_set, self.train_label_set = self._process_dataset(
            dataset=self.train_set)
        self.validate_attr_set, self.validate_label_set = self._process_dataset(
            dataset=self.validate_set)
        self.test_attr_set, self.test_label_set = self._process_dataset(
            dataset=self.test_set)
        self.model = CatBoostClassifier(iterations=train_iter,
                                        depth=depth,
                                        cat_features=[7],
                                        loss_function=loss,
                                        learning_rate=learning_rate,
                                        logging_level=logging_level)
        self.config = {}
        self.config['LEARNING_RATE'] = learning_rate
        self.config['LOSS'] = loss
        self.config['DEPTH'] = depth
        self.config['TRAIN_DATASET_COUNT'] = len(self.train_label_set)
        self.config['VALIDATE_DATASET_COUNT'] = len(self.validate_label_set)
        self.config['TEST_DATASET_COUNT'] = len(self.test_label_set)
        self.final_test_acc = 0.0
        self.feature_res = {}

    def train(self):
        self.model.fit(X=self.train_attr_set,
                       y=self.train_label_set,
                       eval_set=(self.validate_attr_set,
                                 self.validate_label_set))

    def test(self):
        res = self.model.predict(self.test_attr_set)
        # print(res)
        acc = np.sum([
            1 if res[i] == self.test_label_set[i] else 0
            for i in range(len(self.test_label_set))
        ]) / len(self.test_attr_set)
        print("accuracy is %f" % acc)
        self.final_test_acc = acc

    def feature_importance(self):
        res = self.model.get_feature_importance()

        for score, key in zip(res, Classifier.attr_key_list):
            print("%s %f" % (key, score))
            self.feature_res[key] = score

    @staticmethod
    def _process_dataset(dataset):
        attr_set = []
        label_set = []
        for sample in dataset:
            attr = []
            for key in Classifier.attr_key_list:
                attr.append(sample[key])
            label = sample['LABEL']
            attr_set.append(attr)
            label_set.append(label)
        return np.array(attr_set), np.array(label_set)

    def export_res(self, path='', file_name='exp_log.json'):
        res = {}
        res['EXP_CONFIG'] = self.config
        res['FEATURE_IMPORTANCE'] = self.feature_res
        res['FINAL_TEST_ACCURACY'] = self.final_test_acc
        export_json(file_path=os.path.join(LOG_PATH, path, file_name),
                    dict=res)
Example #13
    for e in ax.get_yticklabels() + ax.get_xticklabels():
        e.set_fontsize(6)
        e.set_color(GRAY1)
    ax.tick_params(left=False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    offset = transforms.ScaledTranslation(0, -0.07, fig.dpi_scale_trans)
    for e in ax.get_xticklabels() + ax.xaxis.get_ticklines() + \
             [ax.spines['bottom']]:
        e.set_transform(e.get_transform() + offset)
    ax.spines['bottom'].set_bounds(0, 100)
    _ = ax.set_xlabel('Relative Importance', color=GRAY4, fontsize=7)

# PAGE 354. FIGURE 10.6. Predictor variable importance spectrum for the spam
#           data. The variable names are written on the vertical axis.
plot_relative_feature_importance(np.array(cb_clf.get_feature_importance()))
plt.tight_layout()
plt.savefig('../figures/spam_feature_importance.pdf', dpi=300)
plt.show()

# Partial dependence 

def plot_partial_dependence(ax, feature):
    n = features.index(feature)
    X_tmp = X.copy()
    vals = np.unique(np.percentile(X_tmp[:, n], np.linspace(5, 95, 100)))
    result = []
    for i in range(vals.shape[0]):
        X_tmp[:, n] = vals[i]
        pr = np.mean(cb_clf.predict_proba(X_tmp), axis=0)
        result.append(np.log(pr[1]/pr[0]))
Example #14
    def train(self, feature_names):
        """
        Input:
            feature_names: dictionary of feature names
        Output:
            validity: DataFrame with columns ["MachineIdentifier", "HasDetections", "Predict"]
        """
        # Initialize parameters
        validity = None
        model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        feature_importance = pd.DataFrame()
        START_FOLD = 0
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None

        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            log_path = Path(__file__).absolute().parents[2] / "log" / "train" / str(get_version()) / str("fold{}".format(fold))
            Path.mkdir(log_path, exist_ok=True, parents=True)

            # Measure start time of the classification of this fold
            start = time.time()
            getLogger(get_version()).info("\t >> {} folds start".format(fold))
            send_message("\t :cat: {} folds start".format(fold))

            # Generate dataset
            getLogger(get_version()).info("\t \t Generating datasets...")
            send_message("\t \t Generating datasets...")
            valid = "valid{}".format(str(fold))
            trn_x = super().get_feature_df(feature_names, valid, "train")
            val_x = super().get_feature_df(feature_names, valid, "validate")
            trn_x.set_index("MachineIdentifier", inplace=True)
            val_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].astype(np.int8)
            val_y = val_x["HasDetections"].astype(np.int8)
            getLogger(get_version()).info("\t \t Datasets were generated.")
            send_message("\t \t Datasets were generated.")

            # Initialize variables for scoring
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([trn_y, val_y])
                validity["Predict"] = 0

            # Delete needless features
            del trn_x["HasDetections"], val_x["HasDetections"]

            # Classify
            clf = CatBoostClassifier(iterations=self.params["iterations"],
                                     verbose=self.params["verbose"],
                                     early_stopping_rounds=self.params["early_stopping_rounds"],
                                     random_seed=self.params["random_seed"],
                                     max_depth=self.params["max_depth"],
                                     loss_function=self.params["loss_function"],
                                     custom_metric=self.params["custom_metric"],
                                     eval_metric=self.params["eval_metric"],
                                     rsm=self.params["rsm"],
                                     train_dir=str(log_path))
            clf.fit(trn_x.values, trn_y.values,
                    eval_set=(val_x.values, val_y.values))

            for train_or_valid, metrics in clf.best_score_.items():
                for metric, score in metrics.items():
                    getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                    send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))
            validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict_proba(val_x.values)[:, 1]

            # Calculate feature importance per fold
            if fold == 0:
                feature_importance["feature"] = trn_x.columns
            feature_importance["fold{}".format(fold)] = clf.get_feature_importance()

            # Measure finish time of the classification of this fold
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message("\t :cat: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(fold, hour, minutes, sec))

            # Post-process this fold
            clf.save_model(str(model_path / "valid{}.model".format(fold)))

        # Output CV score
        validity = output_cv(validity, ":cat:")

        # Save importance
        directory_path = Path(__file__).absolute().parents[2] / "importance"
        save_feature_importance(feature_importance, directory_path)

        # Post-process the training
        del feature_importance
        gc.collect()

        return validity
Example #15
params = {
    'early_stopping_rounds': 100,
    'iterations': 1000,
    'verbose': 20,
    'random_seed': 1031
}

# Train a model with the parameters above
clf = CatBoostClassifier(**params)

# Hash the parameters for use in the output file name
hs = hashlib.md5(str(params).encode()).hexdigest()

clf.fit(trains, eval_set=valids, use_best_model=True, plot=True)

# feature importance
feature_importances = clf.get_feature_importance(trains)
feature_names = X_train.columns
with open(f"./output/importance_catb_{hs}.csv",
          "w",
          newline="",
          encoding="utf-8") as f:
    f.write("feature,importance\n")
    for score, name in sorted(zip(feature_importances, feature_names),
                              reverse=True):
        f.write(f'{name},{score}\n')
        print(f'{name}: {score}')

# Predict on the test data
y_pred_proba = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)
Example #16
def kfold_catboost(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting CatBoost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    
    train_df = train_df.replace(np.inf, 0)
    test_df = test_df.replace(np.inf, 0)
    train_df = train_df.fillna(0)
    test_df = test_df.fillna(0)
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        catboost_params = {
            'iterations': 10000,
            'verbose': 1000,
            'learning_rate': 0.05,
            'depth': 8,
            'l2_leaf_reg': 40,
            'bootstrap_type': 'Bernoulli',
            'subsample': 0.7,
            'scale_pos_weight': 5,
            'eval_metric': 'AUC',
            'od_type': 'Iter',
            'od_wait': 200,
            'allow_writing_files': False
        }
        clf = CatBoostClassifier(**catboost_params)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                use_best_model=True, verbose=1000, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        sub_pred = clf.predict_proba(test_df[feats])[:, 1]
        sub_preds += sub_pred / folds.n_splits
        auc_score = roc_auc_score(valid_y, oof_preds[valid_idx])
    
        fold_importance_df = pd.DataFrame(clf.get_feature_importance(prettified=True),
                                          columns=["feature", "importance"])
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_score))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    write_to_csv(train_df, oof_preds, "oof_catboost.csv")
    write_to_csv(test_df, sub_preds, "submission_catboost.csv")
    # Write submission file and plot feature importance
#     if not debug:
#         test_df['TARGET'] = sub_preds
#         test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    temp = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)
    print("no. of contributing features: %d" % (len(temp[temp["importance"]>0])))
    display_importances(feature_importance_df)
    feature_importance_df.groupby("feature").mean().sort_values("importance",ascending=False)["importance"].to_csv("feature_importance.csv")
Example #17
def test_interaction_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.get_feature_importance(pool, fstr_type='Interaction')))
    return local_canonical_file(FIMP_PATH)
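The 'Interaction' importance is returned as rows of `[first_feature_index, second_feature_index, score]`; a short sketch of reading back the array saved above:

import numpy as np

interactions = np.load(FIMP_PATH)
# Each row: the indices of a feature pair plus their interaction strength.
for f1, f2, score in interactions[:5]:
    print(int(f1), int(f2), score)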

Example #18
cat_features = categorical_features_indices


# In[129]:


categorical_features_indices


# In[131]:


feature_score = pd.DataFrame(list(zip(one_hot.dtypes.index, 
                model.get_feature_importance(Pool(one_hot, label=Y, cat_features=categorical_features_indices)))),
                columns=['Feature','Score'])
feature_score = feature_score.sort_values(by='Score', ascending=False, inplace=False, kind='quicksort', na_position='last')


# In[152]:


plt.rcParams["figure.figsize"] = (502,7)
ax = feature_score.plot('Feature', 'Score', kind='bar', color='r')
ax.set_title("Catboost Feature Importance Ranking", fontsize = 6)
ax.set_xlabel('')
rects = ax.patches
labels = feature_score['Score'].round(2)
for rect, label in zip(rects, labels):
    height = rect.get_height()
Example #19
    y_val = y.iloc[valid_index]

    model = CatBoostClassifier(**params)

    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), plot=True)

    y_pred_valid = model.predict(X_val)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_val, y_pred_valid)}")

    score += roc_auc_score(y_val, y_pred_valid) / NFOLDS
    y_preds += model.predict_proba(X_test)[:, 1] / NFOLDS

    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature'] = columns
    fold_importance_df['importance'] = model.get_feature_importance()
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

    del model, X_val, X_tr, y_val, y_tr
    gc.collect()

print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

# In[17]:

sub = pd.read_csv('sample_submission.csv')

# In[18]:
Example #20
import numpy as np

# raw_pred is assumed to hold raw model scores (prediction_type='RawFormulaVal').
sigmoid = lambda x: 1 / (1 + np.exp(-x))  # np.exp handles array inputs element-wise

probabilities = sigmoid(raw_pred)

print(probabilities)


# In[40]:


predictions_gen = model.staged_predict_proba(
    data=X_validation,
    ntree_start=0, 
    ntree_end=5, 
    eval_period=1
)
try:
    for iteration, predictions in enumerate(predictions_gen):
        print('Iteration ' + str(iteration) + ', predictions:')
        print(predictions)
except Exception:
    pass


# In[41]:


model.get_feature_importance(prettified=True)

Example #21
    def model_catboost(self, X, y, X_train, y_train, X_test, y_test,
                       categorical_features_indices, target, file):
        print("Processing CATBOOST....")

        # Added this: start
        train_pool = Pool(X_train,
                          y_train,
                          cat_features=categorical_features_indices)
        validate_pool = Pool(X_test,
                             y_test,
                             cat_features=categorical_features_indices)
        # end

        #         model=CatBoostClassifier(loss_function='MultiClass',use_best_model=True, random_seed=42)#, class_weights=[1,2,3,4,5,6,7,8,9,10,11])
        model = CatBoostClassifier(loss_function='MultiClass',
                                   eval_metric='TotalF1',
                                   use_best_model=True,
                                   random_seed=42,
                                   leaf_estimation_method='Newton')

        model.fit(train_pool,
                  eval_set=validate_pool,
                  use_best_model=True,
                  verbose=50,
                  plot=False,
                  early_stopping_rounds=100)

        # cross-validation
        cv_params = model.get_params()
        cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                     cv_params,
                     fold_count=10,
                     plot=False)
        print('Best cross-validated TotalF1 score: {}'.format(
            np.max(cv_data['test-TotalF1-mean'])))
        # fin

        print("PRIMER prediccion")
        print()
        print(model)
        # make predictions
        expected_y = y_test
        predicted_y = model.predict(X_test)
        # summarize the fit of the model
        print()
        print(metrics.classification_report(expected_y, predicted_y))
        print()
        print(metrics.confusion_matrix(expected_y, predicted_y))

        print("SEGUNDO prediccion")
        print(model.best_iteration_, model.best_score_)
        print(model.evals_result_['validation']['MultiClass'][-10:])

        # prediction
        pred = model.predict(X_test)
        print("PREDICT")
        print(pred)

        print("print dataframe predictions:")
        cm = pd.DataFrame()
        #         cm['DAMAGE'] = y_test
        cm[target] = y_test
        cm['Predict'] = model.predict(X_test)
        print(cm)

        print("SCORES")
        print(model.score(X_test, y_test))
        cm.to_csv(file)  # , index=False)
        #         cm.to_csv("catboost_prediction.csv")#, index=False)

        # confusion matrix
        print("confusion matrix:")
        #         conf_mat = get_confusion_matrix(model, Pool(X_train, y_train, cat_features=categorical_features_indices))
        conf_mat = get_confusion_matrix(
            model,
            Pool(X_test, y_test, cat_features=categorical_features_indices))
        print(conf_mat)

        # feature selection
        print(model.get_feature_importance(prettified=True))
        # feature_importances = model.get_feature_importance(train_pool)
        # feature_names = X_train.columns
        # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        #     print('{}: {}'.format(name, score))
        ##

        return model, cv_data
Example #22
# To Pool Class (for catboost only)
pool_train = Pool(X_train, Y_train, cat_features=Pos)

# Fit the model
print('\nCatboost Optimal Fit with %d rounds...\n' % nrounds)
model_catboost.fit(X=pool_train)


# 3) Shap Importance for the features of the final model
################################################################################
# Shap methodology:
# "https://medium.com/@gabrieltseng/interpreting-complex-models-with-shap-values-1c187db6ec83"
# Catboost has already SHAP integrated
ShapImportance = model_catboost.get_feature_importance(data=pool_train,
                                                       type='ShapValues',
                                                       prettified=True,
                                                       verbose=False)

ShapValues = pd.DataFrame(ShapImportance[:, :-1], columns=list(X_train))

# SHAP values are expressed as shifts in log-odds.
# Below we convert them to shifts in probability instead.
ShapValues['SUMX_LO'] = ShapValues.sum(axis=1)
ShapValues['EXP_VAL_LO'] = ShapImportance[0,-1]
ShapValues['Pred_LO'] = ShapValues['EXP_VAL_LO'] + ShapValues['SUMX_LO']

ShapValues['EXP_VAL_p'] = 1 / (1 + np.exp(-ShapValues['EXP_VAL_LO']))
ShapValues['pred_p'] = 1 / (1 + np.exp(-ShapValues['Pred_LO']))
ShapValues['SUMX_p'] = ShapValues['pred_p'] - ShapValues['EXP_VAL_p']

cols = list(ShapValues)[0:-6]
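As a sanity check (a sketch, assuming the binary Logloss model trained above): probabilities reconstructed from the SHAP log-odds should agree with `predict_proba`:

# Expected value + sum of SHAP log-odds, passed through the sigmoid, should
# reproduce the model's own class-1 probabilities on the training pool.
probs = model_catboost.predict_proba(pool_train)[:, 1]
print(np.allclose(ShapValues['pred_p'].values, probs))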
Example #23
# In[77]:

#Check the ROC AUC score on validation dataset
roc_auc_score(Y_test, y_pred)

# In[78]:

#Plot roc curve
plot_roc_curve(model, X_test, Y_test)

# In[79]:

#Check the feature importance of our model
model.get_feature_importance(data=catboost.Pool(X_test,
                                                label=Y_test,
                                                cat_features=cat_cols),
                             type='FeatureImportance',
                             prettified=True,
                             verbose=True)

# In[80]:

#Check the interaction between features in the model
model.get_feature_importance(data=catboost.Pool(X_test,
                                                label=Y_test,
                                                cat_features=cat_cols),
                             type='Interaction',
                             prettified=True,
                             verbose=True)

# In[58]:
Example #24
fpr_rf_cat, tpr_rf_cat, _ = roc_curve(y_test, y_pred_cat)
auc(fpr_rf_cat, tpr_rf_cat)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf_cat, tpr_rf_cat, label='RF')
#plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF + LR')
#plt.plot(fpr_grd, tpr_grd, label='GBT')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

# get_feature_importance expects a Pool rather than raw arrays with a y keyword.
from catboost import Pool
feature_importance = model.get_feature_importance(
    Pool(np.reshape(x_test, (30000, -1)), label=y_test))

## Logistic Regression
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(x_train, y_train)
y_pred_logistic = logistic.predict(x_test)
logistic.score(x_test, y_test.values.reshape(-1, 1))
logistic.predict_proba(x_test)

from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds = roc_curve(y_test, y_pred_logistic)
auc(fpr, tpr)
Example #26
                                   verbose=VERBOSE,
                                   random_state=RANDOM_STATE,
                                   thread_count=N_THREADS,
                                   task_type="GPU")

        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )
        y_pred_valid = model.predict_proba(valid_dataset)[:, 1]
        y_pred = model.predict_proba(test_dataset)[:, 1]

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = model.feature_names_
        fold_importance["importance"] = model.get_feature_importance()
        fold_importance["fold"] = fold_n + 1
        feature_importance = pd.concat([feature_importance, fold_importance],
                                       axis=0)
        best_iteration = model.best_iteration_
    best_iterations.append(best_iteration)

    fold_score = roc_auc_score(y_valid, y_pred_valid)
    scores.append(fold_score)

    update_tracking(
        run_id,
        "AUC_f{}".format(fold_n + 1),
        fold_score,
        integer=False,
    )
Example #27

y_train = y_train[:,1]
y_test = y_test[:,1]

train_pool = Pool(X_train, y_train)
eval_pool = Pool(X_test, y_test)


# load model
model = CatBoostClassifier()
model.load_model('models/catboost_model_4.dump')


# Feature Importance: Know which feature contributed the most
feature_importances = model.get_feature_importance(train_pool)
feature_names = pd.DataFrame(X_train).columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

print('\n\n\n')
print(model.get_best_score())
print(model.get_params())


# Validation predictions (predict() returns class labels; use predict_proba()
# for probability scores)
predictions = model.predict(eval_pool)
# print(predictions)
pd.DataFrame(predictions).to_csv('validation-scores/val-scores-3.csv')

Example #29
model2 = CatBoostClassifier()
model2.fit(X_train2,
           y_train2,
           cat_features=categorical_features_indices2,
           eval_set=(X_test2, y_test2))

# In[156]:

print('Accuracy of CatBoost classifier on training set: {:.2f}'.format(
    model2.score(X_train2, y_train2)))
print('Accuracy of CatBoost classifier on test set: {:.2f}'.format(
    model2.score(X_test2, y_test2)))

# In[157]:

model2.get_feature_importance()

# In[158]:

X2.columns

# In[183]:

X_test2.shape

# In[184]:

y_test2.shape

# In[185]:
Example #30
    df_train = clean(
        pd.read_csv(
            '/Users/jacobtryba/DSI/assignments/supervised-learning-case-study/data/churn_train.csv'
        )).drop('months_as_user', axis=1)
    df_test = clean(
        pd.read_csv(
            '/Users/jacobtryba/DSI/assignments/supervised-learning-case-study/data/churn_test.csv'
        )).drop('months_as_user', axis=1)

    X_train, X_test, y_train, y_test = X_y(df_train)

    model = CatBoostClassifier(iterations=2,
                               depth=2,
                               learning_rate=1,
                               loss_function='Logloss',
                               verbose=True)
    # train the model
    model.fit(X_train, y_train)
    # make the prediction using the resulting model
    preds_class = model.predict(X_test)
    preds_proba = model.predict_proba(X_test)

    print(preds_class)
    print(preds_proba)

    print(
        sklearn.metrics.accuracy_score(y_test,
                                       preds_class,
                                       normalize=True,
                                       sample_weight=None))
    print(model.get_feature_importance())
Example #31
cbc.fit(X_train,
        y_train,
        cat_features=cat_features,
        logging_level='Verbose',
        eval_set=(X_val, y_val),
        # early_stopping_rounds=100,
        use_best_model=True,
        plot=True)

print("Count of trees in model = {}".format(cbc.tree_count_))

# Print Feature Importance

train_pool = Pool(X_train, y_train, cat_features=cat_features)
feature_importances = cbc.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

# Plotting

sns.set(font_scale=2)

def func_plot_importance(df_imp):

    sns.set(font_scale=1)
    fig = plt.figure(figsize=(3, 3), dpi=100)
    ax = sns.barplot(
        x="Importance", y="Features", data=df_imp, label="Total", color="b")
    ax.tick_params(labelcolor='k', labelsize='10', width=3)
Example #32
def model_rpt(df, config):
    print("training model...")
    numeric_features = list(config['numeric_features'].keys())
    categorical_features = list(config['categorical_features'].keys())
    target = config['target_feature']

    #rpt = open('samplefile.txt', 'w')
    rpt = open(config['output_path'] + config['output_file'] + ".txt", 'w')
    rpt.write("--- dataset summary --- \n")
    rpt.write("output_file          = " + config['output_file'] + ".csv\n")
    rpt.write("n_samples            = " + str(config['n_samples']) + "\n")
    rpt.write("n_features           = " +
              str(len(numeric_features) + len(categorical_features)) + "\n")
    rpt.write("pct_missing          = " +
              "{:.2%}".format(config['pct_missing']) + "\n")
    rpt.write("\n")
    rpt.write("--- model features --- \n")
    rpt.write("numeric_features     = " + str(numeric_features) + "\n")
    rpt.write("categorical_features = " + str(categorical_features) + "\n")
    rpt.write("\n")
    """ df_stats = summary_stats(df)
    rpt.write("--- data summary --- \n")
    rpt.write("Column \t Dtype \t Count\t N_unique\t N_null\t N_notnull\t PCT_null\t PCT_unique\t CHK_null\t CHK_unique\n")
    for index, row in fprtbl.iterrows():
        rpt.write( "%1.2f\t %1.2f\t %1.2f\n" % (row['threshold'],row['fpr'], row['tpr'] ))
    rpt.write("\n") """

    df = prep_df(df, numeric_features, categorical_features)
    X_train, X_eval = part_df(
        df[numeric_features + categorical_features + [target]], 0.2)
    X_eval, X_test = part_df(X_eval, 0.5)

    categorical_features_pos = column_index(
        X_train[numeric_features + categorical_features], categorical_features)

    # Initialize CatBoostClassifier
    model = CatBoostClassifier(iterations=100)
    # Fit model
    model.fit(X_train[numeric_features + categorical_features],
              X_train[target].values,
              plot=False,
              verbose=False,
              cat_features=categorical_features_pos,
              eval_set=(X_eval[numeric_features + categorical_features],
                        X_eval[target].values))
    # --- model predict  ---
    prob_cat_train = model.predict_proba(X_train[numeric_features +
                                                 categorical_features])[:, 1]

    prob_cat_eval = model.predict_proba(X_eval[numeric_features +
                                               categorical_features])[:, 1]

    prob_cat_test = model.predict_proba(X_test[numeric_features +
                                               categorical_features])[:, 1]

    print("(Train)")
    print("AUC Score        : %f" %
          roc_auc_score(X_train[target].values, prob_cat_train))
    print("\n")
    print("(eval)")
    print("AUC Score        : %f" %
          roc_auc_score(X_eval[target].values, prob_cat_eval))
    print("\n")
    print("(Test)")
    print("AUC Score        : %f" %
          roc_auc_score(X_test[target], prob_cat_test))
    print("\n")

    rpt.write("--- dataset performance ---\n")
    rpt.write("Train AUC Score        : %f" %
              roc_auc_score(X_train[target].values, prob_cat_train))
    rpt.write("\n")
    rpt.write("Eval  AUC Score        : %f" %
              roc_auc_score(X_eval[target].values, prob_cat_eval))
    rpt.write("\n")
    rpt.write("Test  AUC Score        : %f" %
              roc_auc_score(X_test[target], prob_cat_test))
    rpt.write("\n\n")

    fpr, tpr, thr = roc_curve(X_test[target], prob_cat_test)
    model_stat = pd.concat([
        pd.DataFrame(fpr).rename(columns={0: 'fpr'}),
        pd.DataFrame(tpr).rename(columns={0: 'tpr'}),
        pd.DataFrame(thr).rename(columns={0: 'threshold'})
    ],
                           axis=1).round(decimals=2)

    # m = model_stat.loc[model_stat['fpr'] <= 0.1]
    m = model_stat.loc[model_stat.groupby(["fpr"])["threshold"].idxmax()]

    # m1 = m.loc[model_stat['threshold'].idxmax()]
    print("--- score thresholds ---")
    print(m.loc[(m['fpr'] > 0.0) & (m['fpr'] <= 0.1)].reset_index(drop=True))
    print("\n")
    fprtbl = m.loc[(m['fpr'] > 0.0) & (m['fpr'] <= 0.1)].reset_index(drop=True)
    rpt.write("--- score thresholds ---\n")
    rpt.write("THR \t FPR \t TPR\t \n")
    for index, row in fprtbl.iterrows():
        rpt.write("%1.2f\t %1.2f\t %1.2f\n" %
                  (row['threshold'], row['fpr'], row['tpr']))
    rpt.write("\n")
    shap_values = model.get_feature_importance(
        Pool(X_test[numeric_features + categorical_features],
             label=X_test[target],
             cat_features=categorical_features_pos),
        type="ShapValues")

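    # For type="ShapValues" the last column holds the expected (base) value;
    # keep only the per-feature contributions.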
    shap_values = shap_values[:, :-1]

    vals = np.abs(shap_values).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(X_train.columns, vals)),
        columns=['feature_name', 'feature_importance'])
    feature_importance.sort_values(by=['feature_importance'],
                                   ascending=False,
                                   inplace=True)
    print("--- feature importance  ---")
    print(feature_importance.reset_index(drop=True))
    rpt.write("--- feature importance  ---\n")
    for index, row in feature_importance.iterrows():
        rpt.write("%-30s\t %1.4f\n" %
                  (row['feature_name'], row['feature_importance']))

    rpt.write("\n")
    rpt.close()
    if config["s3_upload"] == "True":
        ret = upload_file_s3('txt', config)

    return model
Example #33
def test_shap_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, max_ctr_complexity=1)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.get_feature_importance(pool, fstr_type='ShapValues')))
    return local_canonical_file(FIMP_PATH)