예제 #1
0
 def compute_fi_gain(self):
     """Return the normalized gain-based feature importance of the predictor.

     Gain scores are read from ``self.predictor.get_score``, passed through
     ``self.group_fi``, padded with a zero for every column of
     ``self.x_train_cols`` that is missing, passed through ``self.group_fi``
     a second time and finally handed to ``normalize_series``.
     """
     # TODO: fix it
     raw_scores = self.predictor.get_score(importance_type='gain')
     importance = Series(self.group_fi(raw_scores))
     # xgboost doesn't always return all the columns it received in fit.
     # https://www.kaggle.com/c/homesite-quote-conversion/discussion/18669
     for column in self.x_train_cols:
         if column in importance.index:
             continue
         importance[column] = 0
     regrouped = self.group_fi(importance)
     return normalize_series(Series(regrouped))
def run_experiment(
        model_name: str, get_data: callable, compute_permutation: bool,
        save_results: bool, model, exp_results_path):
    """Fit ``model`` on a binned train/test split and print its test log-loss.

    Args:
        model_name: Label stored under the ``model`` key of the results.
        get_data: Zero-argument callable returning ``(X, y)``.
        compute_permutation: When true, permutation importances are computed
            on both splits via ``model.compute_fi_permutation``; otherwise
            NaN placeholders keyed by column name are used.
        save_results: Currently unused, the CSV export below is disabled.
        model: Estimator exposing ``fit``, ``predict``, ``trees``,
            ``compute_feature_importance`` and ``compute_fi_permutation``.
        exp_results_path: Target of the (disabled) CSV export.

    Returns:
        None. The log-loss (NaN when ``y`` does not have exactly two distinct
        values) is printed.
    """
    # Local import keeps this block self-contained.
    from numpy import asarray

    X, y = get_data()
    seed(7)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=VAL_RATIO)
    preprocessing_pipeline.fit(X_train)
    X_train = preprocessing_pipeline.transform(X_train)
    X_test = preprocessing_pipeline.transform(X_test)

    print("binning data")
    num_cols = get_num_cols(X_train.dtypes)
    bin_mapper = BinMapper(max_bins=256, random_state=42)
    X_train.loc[:, num_cols] = bin_mapper.fit_transform(
        X_train.loc[:, num_cols].values)
    X_test.loc[:, num_cols] = bin_mapper.transform(
        X_test.loc[:, num_cols].values)

    original_dtypes = X_train.dtypes
    model.fit(X_train, y_train)
    # Predict once and reuse the result for the log-loss below.
    test_prediction = model.predict(X_test)
    # NOTE(review): the permutation importances are computed but never added
    # to ``results`` -- confirm whether they should be reported.
    if compute_permutation:
        permutation_train = model.compute_fi_permutation(X_train,
                                                         y_train).to_dict()
        permutation_test = model.compute_fi_permutation(X_test,
                                                        y_test).to_dict()
    else:
        # Iterating a dtypes Series yields the dtypes themselves; the column
        # names live in its index, and those are the keys we want.
        permutation_train = {col: nan for col in original_dtypes.index}
        permutation_test = dict(permutation_train)

    is_classification = len(unique(y)) == 2
    if is_classification:
        df = DataFrame()
        df['p'] = test_prediction
        # Assign the labels positionally: a pandas ``y_test`` keeps the
        # shuffled index of the split, and label-based alignment against the
        # frame's index would scramble the labels or fill them with NaN.
        df['y'] = asarray(y_test)
        df = df[df.p.notna()]
        logloss = log_loss(df['y'], df['p'])
    else:
        logloss = nan

    fi = Series(model.compute_feature_importance(method='gain'))
    fi = normalize_series(fi).to_dict()
    leaves = [tree.n_leaves for tree in model.trees]
    results = dict(model=f"{model_name}",
                   ntrees=len(model.trees),
                   leaves=leaves,
                   nleaves=sum(leaves),
                   logloss=logloss,
                   gain=fi)

    # if save_results:
    #     DataFrame(Series(results)).T.to_csv(exp_results_path)
    print(logloss)
예제 #3
0
 def compute_fi_permutation(self, X, y):
     """Return normalized permutation feature importance for ``X`` and ``y``.

     For every column of ``X`` the error reported by ``self.compute_error``
     is averaged over ``N_PERMUTATIONS`` calls to ``permute_col`` on a copy
     of ``X``; the importance is that average minus the error on the
     untouched data. The raw scores go through ``self.group_fi`` before
     normalization.
     """
     baseline_error = self.compute_error(X, y)
     importance_by_col = {}
     for col in X.columns:
         shuffled_x = X.copy()
         shuffled_errors = []
         for _ in range(N_PERMUTATIONS):
             # presumably shuffles ``col`` of ``shuffled_x`` in place, since
             # the return value is not used -- verify against permute_col
             permute_col(shuffled_x, col)
             shuffled_errors.append(self.compute_error(shuffled_x, y))
         importance_by_col[col] = mean(array(shuffled_errors)) - baseline_error
     grouped = self.group_fi(importance_by_col)
     return normalize_series(Series(grouped))
예제 #4
0
def get_feature_importance(exp, path, loo):
    """Load the feature-importance table of one experiment.

    Experiments whose name starts with ``'Ours'`` are read with the gain and
    permutation columns only; every other experiment is also read with the
    SHAP columns. Each column is parsed with ``literal_eval``. When ``loo``
    is truthy a normalized ``'loo'`` column from ``get_loo_fi`` is appended.
    """
    our_cols = ['gain', 'permutation_train', 'permutation_test']
    all_cols = our_cols + ['shap_train', 'shap_test']
    if exp.startswith('Ours'):
        cols = our_cols
    else:
        cols = all_cols
    converters = dict.fromkeys(cols, literal_eval)
    fi_df = get_fi(path, cols, converters)
    if not loo:
        return fi_df
    # Store the raw values first so they are aligned to the table's index
    # before being normalized.
    fi_df['loo'] = get_loo_fi(path, fi_df.index.tolist())
    fi_df['loo'] = normalize_series(fi_df['loo'])
    return fi_df
예제 #5
0
 def compute_fi_permutation(self, X, y):
     """Return normalized permutation feature importance based on MSE.

     For every column of ``X`` the column is re-permuted ``N_PERMUTATIONS``
     times on a copy of ``X``; the importance is the mean squared error of
     ``self.predictor`` averaged over those permutations minus the mean
     squared error on the untouched data.
     """
     def prediction_mse(features):
         # Mean squared error of the predictor on ``features`` against ``y``.
         return mean(square(y - self.predictor.predict(features)))

     baseline_mse = prediction_mse(X)
     importance_by_col = {}
     for col in X.columns:
         shuffled_x = X.copy()
         shuffled_mse = []
         for _ in range(N_PERMUTATIONS):
             shuffled_x[col] = permutation(shuffled_x[col])
             shuffled_mse.append(prediction_mse(shuffled_x))
         importance_by_col[col] = mean(array(shuffled_mse)) - baseline_mse
     return normalize_series(Series(importance_by_col))
예제 #6
0
def run_experiment(
        model_name: str, get_data: callable, compute_permutation: bool,
        save_results: bool, model, exp_results_path):
    """Fit ``model`` on a train/test split, report log-loss and plot its trees.

    Args:
        model_name: Label stored under the ``model`` key of the results.
        get_data: Zero-argument callable returning ``(X, y)``.
        compute_permutation: When true, permutation importances are computed
            on both splits via ``model.compute_fi_permutation``; otherwise
            NaN placeholders keyed by column name are used.
        save_results: When true, the results row is written as CSV to
            ``exp_results_path``.
        model: Estimator exposing ``fit``, ``predict``, ``trees``,
            ``compute_feature_importance`` and ``compute_fi_permutation``.
        exp_results_path: Destination of the CSV results file.

    Returns:
        None. The log-loss (NaN when ``y`` does not have exactly two distinct
        values) is printed, then every tree is plotted until the first
        plotting failure.
    """
    X, y = get_data()
    seed(7)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=VAL_RATIO)
    preprocessing_pipeline.fit(X_train)
    X_train = preprocessing_pipeline.transform(X_train)
    X_test = preprocessing_pipeline.transform(X_test)
    original_dtypes = X_train.dtypes
    model.fit(X_train, y_train)
    # Predict once and reuse the result for the log-loss below.
    test_prediction = model.predict(X_test)
    # NOTE(review): the permutation importances are computed but never added
    # to ``results`` -- confirm whether they should be exported.
    if compute_permutation:
        permutation_train = model.compute_fi_permutation(X_train,
                                                         y_train).to_dict()
        permutation_test = model.compute_fi_permutation(X_test,
                                                        y_test).to_dict()
    else:
        # Iterating a dtypes Series yields the dtypes themselves; the column
        # names live in its index, and those are the keys we want.
        permutation_train = {col: nan for col in original_dtypes.index}
        permutation_test = dict(permutation_train)

    is_classification = len(unique(y)) == 2
    if is_classification:
        logloss = log_loss(y_test, test_prediction)
    else:
        logloss = nan

    fi = Series(model.compute_feature_importance(method='gain'))
    fi = normalize_series(fi).to_dict()
    leaves = [tree.n_leaves for tree in model.trees]
    results = dict(model=f"{model_name}",
                   ntrees=len(model.trees),
                   leaves=leaves,
                   nleaves=sum(leaves),
                   logloss=logloss,
                   gain=fi)

    if save_results:
        DataFrame(Series(results)).T.to_csv(exp_results_path)
    print(logloss)
    for tree in model.trees:
        try:
            tree_vis = TreeVisualizer()
            tree_vis.plot(tree)
        except Exception as exc:
            # Plotting is best effort: stop at the first failure, but say
            # why, and let KeyboardInterrupt/SystemExit propagate.
            print(f"tree visualization failed: {exc}")
            return
def get_feature_importance(exp, path, loo):
    """Build the feature-importance table of one experiment from its CSV.

    Experiments whose name starts with ``'Ours'`` contribute the gain and
    permutation columns only; every other experiment also contributes the
    SHAP columns. Each cell of the first CSV row is parsed with
    ``literal_eval`` and expanded into one column of the result, which is
    sorted by index. When ``loo`` is truthy a normalized ``'loo'`` column
    from ``get_loo_fi`` is appended.
    """
    our_cols = ['gain', 'permutation_train', 'permutation_test']
    if exp.startswith('Ours'):
        cols = our_cols
    else:
        cols = our_cols + ['shap_train', 'shap_test']
    raw = pd.read_csv(path, converters=dict.fromkeys(cols, literal_eval))
    table = pd.DataFrame()
    for col in cols:
        # Column-wise assignment aligns every later column to the index
        # established by the first one.
        table[col] = pd.Series(raw.loc[0, col])
    table = table.sort_index()
    if not loo:
        return table
    table['loo'] = get_loo_fi(path, table.index.tolist())
    table['loo'] = normalize_series(table['loo'])
    return table
    def compute_fi_permutation(self, X, y):
        """Return normalized permutation feature importance, computed in parallel.

        Only features with a strictly positive gain importance (as reported
        by ``self.compute_fi_gain``) are evaluated; every other column of
        ``self.x_train_cols`` keeps an importance of 0. For each evaluated
        feature, ``worker`` is run ``N_PERMUTATIONS`` times in a process pool
        and the importance is the mean of its results minus the error on the
        untouched data.
        """
        results = {col: 0 for col in self.x_train_cols}
        true_error = self.compute_error(X, y)
        # get only features that got positive fi_gain
        gain_fi = self.compute_fi_gain()
        positive_fi_gain = gain_fi[gain_fi > 0].index.tolist()
        if positive_fi_gain:
            # A single pool serves all features; starting and tearing down a
            # fresh pool per feature only adds process start-up overhead.
            with multiprocessing.Pool(4) as process_pool:
                for col in positive_fi_gain:
                    start = time()
                    # presumably ``worker`` permutes ``col`` and returns the
                    # resulting error -- verify against its definition
                    args = [(X, y, col, self.compute_error)
                            for _ in range(N_PERMUTATIONS)]
                    prm_results = process_pool.starmap(worker, args)
                    results[col] = mean(array(prm_results)) - true_error
                    end = time()
                    print(f"{col} run took {end - start}")
        fi = Series(results)
        return normalize_series(fi)
 def compute_fi_gain(self):
     """Return the predictor's gain feature importance, normalized."""
     gain_by_feature = self.predictor.compute_feature_importance(
         method='gain')
     return normalize_series(Series(gain_by_feature))
 def compute_fi_gain(self):
     """Return normalized, grouped feature importance of the predictor.

     ``self.predictor.feature_importances_`` is paired positionally with
     ``self.x_train_cols``, passed through ``self.group_fi`` and normalized.
     """
     # TODO: fix it
     importance_by_col = {
         col: importance
         for col, importance in zip(self.x_train_cols,
                                    self.predictor.feature_importances_)
     }
     grouped = self.group_fi(importance_by_col)
     return normalize_series(Series(grouped))
 def compute_fi_gain(self):
     """Return the predictor's feature importance, normalized.

     The values of ``self.predictor.feature_importances_`` are labelled with
     ``self.predictor.feature_names_``.
     """
     # TODO: fix it
     predictor = self.predictor
     importance = Series(predictor.feature_importances_,
                         index=predictor.feature_names_)
     return normalize_series(importance)