Example #1
def run_exp_02_logistic_regression_pd(model_path, retrain_model=False):
    """
    Linear Model Experiment
    """

    # local preprocessing (e.g., OHE for linear models)
    X, y = dataloader.get_train_ohe(dask=False)
    X.fillna(0, inplace=True)
    """
    Fit model to training data
    """
    from project.experiments.training_scripts.train_02_logistic_regression import train_logistic_regression
    train_logistic_regression(X,
                              y,
                              save_to=model_path,
                              recompute=retrain_model)
    """
    Test performance on unseen data
    """
    X, y = dataloader.get_test_ohe(dask=False)
    X.fillna(0, inplace=True)

    from project.experiments.testing_scripts.test_classification import test_classification
    test_classification(X, y, model_path)
    """
    Get interpretation - simple weights for linear models
    """
    import eli5
    import joblib
    explanation_df = eli5.explain_weights_df(joblib.load(model_path),
                                             feature_names=X.columns.values)
    explanation_df.sort_values('weight', inplace=True)
    print(explanation_df.head())
Example #2
def main():
    start_time = time.time()
    train = read_train_data(nrows=None)
    test = read_test_data()

    train, test = process_data(train, test)
    X = train.drop(['ID_code', 'target'], axis=1)
    y = train['target']
    X_test = test.drop(['ID_code'], axis=1)
    model = lgb.LGBMClassifier(**params, n_estimators=20000, n_jobs=10)
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          stratify=y)
    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=1000,
              early_stopping_rounds=200)
    perm = PermutationImportance(model, random_state=1).fit(X_valid, y_valid)
    eli_df = eli5.explain_weights_df(perm,
                                     feature_names=X.columns.tolist(),
                                     top=len(X.columns))
    eli_df.to_csv(
        os.path.join(data.permutation_importance.__path__[0],
                     '0304_square_feature.csv'))
    elapsed_time = time.time() - start_time
    print(elapsed_time)
Example #3
def permutation_importance(model, x_all, targets_all, config):
    _logger.info("Computing permutation importance!!")
    if config.algorithm not in transformed_modelmaps.keys():
        raise AttributeError("Only the following can be used for permutation "
                             "importance {}".format(
            list(transformed_modelmaps.keys())))

    y = targets_all.observations

    classification = hasattr(model, 'predict_proba')

    if not classification:
        for score in ['explained_variance',
                      'r2',
                      'neg_mean_absolute_error',
                      'neg_mean_squared_error']:
            pi_cv = apply_multiple_masked(
                PermutationImportance(model, scoring=score,
                                      cv='prefit', n_iter=10,
                                      refit=False).fit, data=(x_all, y)
            )
            feature_names = geoio.feature_names(config)
            df_picv = eli5.explain_weights_df(
                pi_cv, feature_names=feature_names, top=100)
            csv = Path(config.output_dir).joinpath(
                config.name + "_permutation_importance_{}.csv".format(
                    score)).as_posix()
            df_picv.to_csv(csv, index=False)
Example #4
def permutation_importance(my_model, val_X, val_y, ret_df=False):
    import eli5
    from eli5.sklearn import PermutationImportance
    perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
    if ret_df:
        return eli5.explain_weights_df(perm, feature_names=val_X.columns.tolist())
    else:
        return eli5.show_weights(perm, feature_names=val_X.columns.tolist())
Example #5
def show_global_interpretation_eli5(X_train, y_train, features, clf,
                                    dim_model):
    """show most important features via permutation importance in ELI5"""
    if dim_model == "XGBoost":
        df_global_explain = eli5.explain_weights_df(
            clf, feature_names=features.values, top=5).round(2)
    else:
        perm = PermutationImportance(clf, n_iter=2,
                                     random_state=1).fit(X_train, y_train)
        df_global_explain = eli5.explain_weights_df(
            perm, feature_names=features.values, top=5).round(2)
    bar = (alt.Chart(df_global_explain).mark_bar(
        color="red", opacity=0.6,
        size=16).encode(x="weight",
                        y=alt.Y("feature", sort="-x"),
                        tooltip=["weight"]).properties(height=160))
    st.write(bar)
Example #6
def calculate_pfi(rf, X, y):
    """Calculate the PFI."""
    rf.n_jobs = get_ncpus()
    perm_importance = eli5.sklearn.PermutationImportance(rf,
                                                         random_state=1).fit(
                                                             X, y)
    return eli5.explain_weights_df(perm_importance,
                                   feature_names=list(X.columns))
Example #7
def get_feature_importance(model):

    feature_importances = eli5.explain_weights_df(model['chosen_model'])

    if feature_importances is None:
        return list()

    else:
        return feature_importances.to_dict(orient='records')
Example #8
def test_explain_weights(boston_train):
    X, y, feature_names = boston_train
    reg = LinearRegression()
    reg.fit(X, y)
    expl = explain_weights(reg)
    df = format_as_dataframe(expl)
    check_targets_dataframe(df, expl)
    check_targets_dataframe(explain_weights_df(reg), expl)
    df_dict = explain_weights_dfs(reg)
    assert set(df_dict.keys()) == {'targets'}
    check_targets_dataframe(df_dict['targets'], expl)
Example #9
    def analyze_fi_pi(self):
        "Feature Importance - Permutation Importance"

        # we need to impute the data first before calculating permutation importance
        train_X_imp = self.imputer.transform(self.X)
        # set up the meta-estimator to calculate permutation importance on our
        # training data
        perm_train = PermutationImportance(self.estimator,
                                           scoring=self.spearman_scorer,
                                           n_iter=50,
                                           random_state=RANDOM_STATE)
        # fit and see the permutation importances
        perm_train.fit(train_X_imp, self.y)
        eli5.explain_weights_df(perm_train, feature_names=self.features)

        # plot the distributions
        perm_train_feat_imp_df = pd.DataFrame(data=perm_train.results_,
                                              columns=self.features)
        sns.boxplot(data=perm_train_feat_imp_df).set(
            title='Permutation Importance Distributions (training data)',
            ylabel='Importance')
Example #10
    def global_feautures_b(self, data):
        #data = self._data
        final = pd.DataFrame.from_dict({'feature' : list(data.columns) + ['<BIAS>']})
        for model in self._models:
            #print(model)
            #print(self._model[str(model)
            final_features = eli5.explain_weights_df(self._models[str(model)], feature_names=list(data))
            final_features = final_features[['feature', 'weight']].set_index('feature')
            final_features = final_features.rename(columns={'weight': model})
            # join on the feature name: explain_weights_df sorts its rows by weight,
            # so a purely positional join would mix up features
            final = final.join(final_features, on='feature')
        #efor

        return(final)
Example #11
def main():
    df = pd.read_csv('/users/zcb/desktop/num_test/train.csv')
    label = df.columns
    print(label)
    df_target = df.iloc[:, 0]
    df_feature = df.iloc[:, 1:]

    x_train, x_test, y_train, y_test = train_test_split(df_feature,
                                                        df_target,
                                                        train_size=0.7,
                                                        random_state=0)
    my_model = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
    perm = PermutationImportance(my_model, random_state=1).fit(x_test, y_test)
    df = eli5.explain_weights_df(perm)
    print(df)
Example #12
    def analyze_fi_mdi(self):
        "Feature Importance - Mean Decrease Impurity"

        feat_imp_df = eli5.explain_weights_df(self.estimator,
                                              feature_names=self.features)

        #import pdb; pdb.set_trace()
        # get the feature importances from each tree and then visualize the
        # distributions as boxplots
        all_feat_imp_df = pd.DataFrame(
            data=[tree.feature_importances_ for tree in self.estimator],
            columns=self.features)

        sns.boxplot(data=all_feat_imp_df).set(
            title='Feature Importance Distributions', ylabel='Importance')
Example #13
def perm_import(
    model,
    X_val,
    y_val,
    score,
    return_importances=False,
):

    # Load up model

    with open(model, 'rb') as f:
        ml_model = pickle.load(f)
    perm = PermutationImportance(ml_model, scoring=score,
                                 random_state=1).fit(X_val, y_val)
    feat_name = X_val.columns.tolist()
    eli5_show_weights = eli5.show_weights(perm, feature_names=feat_name)

    importances = eli5.explain_weights_df(perm, feature_names=feat_name)

    if return_importances:
        return importances
Example #14
    def _get_permutation_importances(self, pipeline, X_train, Y_train):
        if not self.configs['fit'].get('permutation'):
            return None

        if X_train.ndim > 2:
            return None

        if len(self.feature_columns) > 50:
            logger.warning('COLUMNS IS TOO LARGE, THEN NO PERMUTATION')
            return None

        _estimator = pipeline.steps[-1][1]
        if not hasattr(_estimator, 'score'):
            logger.warning('NO SCORE METHOD, THEN NO PERMUTATION')
            return None

        perm = PermutationImportance(_estimator, random_state=42).fit(
            self.toarray_like(X_train), Y_train)
        return eli5.explain_weights_df(perm,
                                       feature_names=self.feature_columns)
Example #15
def get_feature_importance(model, features):
    """Calculates feature importance of classifier using eli5 `explain_weights` method

    Parameters
    ----------
    model : dict
        Carries the model class

    Returns
    -------
    list of dicts
        Response structured as list of dicts with keys: 'feature' and 'weight'
    """

    feature_importances = eli5.explain_weights_df(model['chosen_model'],
                                                  feature_names=list(
                                                      features.columns))

    if feature_importances is None:
        return list()

    else:
        return feature_importances.to_dict(orient='records')
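# Hypothetical usage sketch (fitted_clf and X_train are illustrative names, not
# defined in the original project code):
#
#   records = get_feature_importance({'chosen_model': fitted_clf}, X_train)
#   # records looks like [{'feature': 'age', 'weight': 0.42, ...}, ...]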
Example #16
def get_permutation_imp(m,
                        X,
                        y,
                        feats,
                        random_state=random_state,
                        scoring='roc_auc'):
    perm_train = PermutationImportance(m,
                                       random_state=random_state,
                                       scoring=scoring)
    _ = perm_train.fit(X, y)
    all_feat_imp_df = eli5.explain_weights_df(perm_train, feature_names=feats)

    perm_train_feat_imp_df = pd.DataFrame(data=perm_train.results_,
                                          columns=feats)
    perm_train_feat_imp_df = perm_train_feat_imp_df[list(
        all_feat_imp_df.feature)]
    ax = perm_train_feat_imp_df.iloc[:, :15].boxplot(figsize=(9, 7))
    ax.set(title='Permutation Importance Distributions (training data)',
           ylabel='Importance')
    plt.xticks(rotation=90)
    plt.show()
    display(all_feat_imp_df[:15])

    return all_feat_imp_df
Example #17
def analyze_feature_importance(pipelines: List[Pipeline],
                               serialization_dir: Path,
                               logger: Logger):
    feature_importances = []
    for i, p in enumerate(pipelines):
        # Analyze feature importance. This is a fairly scrappy endeavour: we need to obtain the feature names
        # in order (for which sklearn provides no method) and find the classifier step in the pipeline (again, no method).
        feature_names = get_feature_names_from_pipeline(p)
        classifier = get_named_component_of_pipeline(p, CLASSIFIER_PIPELINE_STEP_NAME)
        assert classifier is not None, "Pipeline broken? Could not find classifier."
        assert type(classifier) is PredictOnTransformClassifierWrapper, "Unexpected pipeline step type."
        try:
            feature_importance = eli5.explain_weights_df(classifier.classifier_, feature_names=feature_names)
            if feature_importance is None:
                raise ValueError
        except Exception as e:
            logger.warning(f"Could not determine feature importance for {repr(type(classifier.classifier_))}.", e)
            continue
        feature_importance["run"] = i
        feature_importances.append(feature_importance)

    if not feature_importances:
        logger.warning("No feature importances found.")
        return
    feature_importances = pd.concat(feature_importances)

    # write raw data to file
    all_importances_file = serialization_dir / "feature_importances.csv"
    feature_importances.to_csv(all_importances_file)

    # average weight by run and write it to file
    importances_aggregated = feature_importances.groupby("feature")["weight"].describe(percentiles=[])
    importances_aggregated.sort_values("mean", ascending=False, inplace=True)
    aggregated_importances_file = serialization_dir / "feature_importances_aggregated.txt"
    with aggregated_importances_file.open("w") as f:
        f.write(tabulate(importances_aggregated, headers="keys"))
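# get_feature_names_from_pipeline is a project helper that is not shown above. On
# recent scikit-learn (roughly >= 1.0) a rough equivalent could look like the
# sketch below -- an assumption, not the original implementation:
def get_feature_names_from_pipeline_sketch(pipeline):
    # pipeline[:-1] is the preprocessing sub-pipeline; the final classifier step
    # does not implement get_feature_names_out, so it is excluded.
    return list(pipeline[:-1].get_feature_names_out())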
Example #18
def explain_classifiers(X, y, df, feature_cols, stop_words, n=20):

	lr_txt = LogisticRegression(
			random_state=42,
			warm_start=True,
			C = 10,
			class_weight='balanced',
			solver="newton-cg",	
			penalty="l2",
		)
	rf = RandomForestClassifier(
				random_state=42,
				max_features="sqrt",
				max_depth= 10,
				n_estimators=1000 
	)
	lr_cat = LogisticRegression(class_weight='balanced',
			C = 0.06,
			warm_start=True,
			random_state=42)

	X_text = df['relevant_reviews']
	vect = TfidfVectorizer(stop_words=stop_words, norm="l2",  max_df=0.6, max_features=1000)
	X_vect = vect.fit_transform(X_text)
	feature_df = df[[col for col in X if col in feature_cols]]
	X_feat = pd.concat([feature_df],axis=1)

	#LR cat Feature Importance 
	rcParams.update({'figure.autolayout': True})
	lr_cat.fit(X_feat, y)
	weights = list(zip(lr_cat.coef_[0], X_feat.columns))
	weights.sort(reverse=True)
	weights_df = pd.DataFrame(weights[:20], columns=['weight', 'feature'])
	sns.set_context("talk")
	c = sns.barplot(x="weight", y='feature', data=weights_df, palette="Set3")
	plt.title("Yelp Resteraunt Features Associated With Resteraunt Inspection Failures \n (Positive weights imply increased risk of failing.)")
	plt.xlabel("Logistic Regression Weights")
	plt.ylabel("Feature")
	plt.xticks(rotation=90)
	plt.show()

	#RF Feature Importance 
	rcParams.update({'figure.autolayout': True})
	rf.fit(X_feat, y)
	weights = list(zip(rf.feature_importances_, X_feat.columns))
	weights.sort(reverse=True)
	weights_df = pd.DataFrame(weights[:20], columns=['weight', 'feature'])
	sns.set_context("talk")
	c = sns.barplot(x="weight", y='feature', data=weights_df, palette="Set3")
	plt.title("Yelp Resteraunt Features Associated With Resteraunt Inspection Failures \n (Larger weights imply increased importance)")
	plt.xticks(rotation=90)
	plt.xlabel("Random Forest Weights")
	plt.ylabel("Feature")
	plt.show()


	rcParams.update({'figure.autolayout': False})
	lr_txt.fit(X_vect,y)
	weights_df = eli5.explain_weights_df(lr_txt, vec=vect, top=20,target_names=y)
	sns.set_context("talk")
	b = sns.barplot(x="feature", y='weight', data=weights_df, palette="Set3")
	plt.xlabel('Word')
	plt.title("Yelp Resteraunt Features Associated With Resteraunt Inspection Failures \n (Positive weights imply increased risk of failing.)")
	plt.ylabel("Logistic Regression Weights")
	plt.xticks(rotation=45)
	plt.show()
Example #19
    y_train = df.iloc[:, 0].values - 1
    f_names = df.columns[1:].values
    t_names = df.iloc[:, 0].unique()
    # per-class counts (based on the target column)
    print('\nTraining dataset shape: ', X_train.shape, ' Number of features: ',
          X_train.shape[1])
    num_categories = np.unique(y_train).size
    sum_y = np.asarray(np.unique(y_train.astype(int), return_counts=True))
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # initialize the classifier and fit it on the training set
    clf = RandomForestClassifier(verbose=1,
                                 n_jobs=-1,
                                 random_state=args.randomseed,
                                 n_estimators=100).fit(X_train, y_train)
    print('\nClassifier parameters:\n')
    print(clf.get_params())

    # export the feature importance scores
    df_import = eli5.explain_weights_df(clf,
                                        target_names=t_names,
                                        feature_names=f_names)
    df_import.to_csv('f_weight_output.csv', index=None)
    print(
        "\nThe feature importance scores have been saved to 'f_weight_output.csv'.")

    end_time = time.time()  # program end time
    print('\n[Finished in: {0:.6f} mins = {1:.6f} seconds]\n'.format(
        ((end_time - start_time) / 60), (end_time - start_time)))
Example #20
def examine_top_weights(df, clf, vec, CONTEXT_THRESH, top_n_domains,
                        n_features):

    # Get top feature weights
    weights = eli5.explain_weights_df(clf, vec=vec)

    # Take the absolute value so positive and negative predictors are ranked together by magnitude
    weights['weight'] = np.absolute(weights['weight'])

    # Sort from largest to smallest
    weights.sort_values('weight', inplace=True, ascending=False)

    # Get the percentage of representation of the top n domains
    base_props = df['domain'].value_counts(normalize=True)[0:top_n_domains]

    # Iterate over top n features
    for i in range(0, n_features):

        # Get feature weight
        row = weights.iloc[i]

        # Get feature name
        feature = row['feature']

        # Find domains whose articles contain top features and get the proportion of articles per domain which have it
        props = df[(df['preprocessed_text'].str.contains(
            " {} ".format(feature), regex=False)) & (
                df['domain'].isin(base_props.keys()))]['domain'].value_counts(
                    normalize=True)

        # Iterate over domains
        for key in props.keys():

            # If it shows up more than expected
            if props[key] > 2 * base_props[key]:

                # Examine forward and backward context of term, print common contexts
                context = find_common_context_windows(feature, key, df,
                                                      'preprocessed_text', 1)
                forward_context = context[0].value_counts(normalize=True)
                if forward_context[0] > CONTEXT_THRESH:
                    print(feature.upper())
                    print(
                        key,
                        "- Observed: {:.3f}, Expected: {:.3f}, Difference: {:.3f}"
                        .format(props[key], base_props[key],
                                props[key] / base_props[key]))
                    print(forward_context[forward_context > CONTEXT_THRESH])
                    print('\n')
                backward_context = context[2].value_counts(normalize=True)
                if backward_context[0] > CONTEXT_THRESH:
                    print(feature.upper())
                    print(
                        key,
                        "- Observed: {:.3f}, Expected: {:.3f}, Difference: {:.3f}"
                        .format(props[key], base_props[key],
                                props[key] / base_props[key]))
                    print(backward_context[backward_context > CONTEXT_THRESH])
                    print('\n')

        print('\n')
Example #21
  plt.clim(0.003,0.010)
  plt.colorbar()
  plt.show()

#permutation feature weights

import eli5
from eli5 import format_as_image
from eli5.sklearn import PermutationImportance
from sklearn.neural_network import MLPClassifier
NNMLP_clf = MLPClassifier(random_state=48, max_iter=50)
NNMLP_clf.fit(new_last_conv1, y_test1[:])

perm_all = PermutationImportance(NNMLP_clf).fit(new_last_conv1, y_test1)
print('CNN results')
exp = eli5.explain_weights_df(perm_all, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

perm_corr = PermutationImportance(NNMLP_clf).fit(new_last_conv1[correct_cnn[:]], y_test1[correct_cnn[:]])
print('CNN Correct results')
exp_corr = eli5.explain_weights_df(perm_corr, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

perm_mis = PermutationImportance(NNMLP_clf).fit(new_last_conv1[misclass_cnn[:]], y_test1[misclass_cnn[:]])
print('CNN Misclass results')
exp_mis = eli5.explain_weights_df(perm_mis, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

from sklearn.preprocessing import normalize

n0= normalize(final_last_conv1[correct_cnn[:]])
n1= normalize(final_last_conv1[misclass_cnn[:]])
n2= normalize(X_test1[:,:,0,0])
Example #22
# 7.11.3 Conclude: Get feature weights

"""
# If you are using jupyter notebook, use:

eli5.show_weights(
                  perm,
                  feature_names = colnames      # X_test.columns.tolist()
                  )


"""

fw = eli5.explain_weights_df(
                  perm,
                  feature_names = colnames      # X_test.columns.tolist()
                  )

# 7.11.4 Print importance
fw


##################### EE. Randomized Search #################

# Tune parameters using randomized search
# 8. Hyperparameters to tune and their ranges
parameters = {'xg__learning_rate':  uniform(0, 1),
              'xg__n_estimators':   range(50,300),
              'xg__max_depth':      range(3,10),
              'pca__n_components' : range(20,30)}
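# A minimal sketch (assumed, not part of the original notebook) of feeding the
# grid above to RandomizedSearchCV. The 'pca' and 'xg' step names are inferred
# from the parameter prefixes; the actual pipeline is not shown in this excerpt.
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

pipe = Pipeline([('pca', PCA()), ('xg', XGBClassifier())])
rs = RandomizedSearchCV(pipe,
                        param_distributions=parameters,
                        n_iter=20,
                        cv=3,
                        random_state=42)
# rs.fit(X_train, y_train)   # training data comes from earlier in the notebook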
Example #23
def permutation_importance(model, val_X, val_y, path):
    perm = PermutationImportance(model, random_state=1).fit(val_X, val_y)
    Table = eli5.explain_weights_df(perm, feature_names=val_X.columns.tolist())
    Table.to_csv(path)
    print('generate ' + path)
Example #24
misclass_gbc = np.where(y_pred_gbc!=y_test)
misclass_gbc = misclass_gbc[0].tolist()
print(misclass_gbc)

correct_gbc = np.where(y_pred_gbc==y_test)
correct_gbc = correct_gbc[0].tolist()
print(correct_gbc)

import eli5
from eli5.sklearn import PermutationImportance
from IPython.display import display

perm_gbc = PermutationImportance(gbc_clf).fit(X_test, y_test)
print('GBC Results')
exp_gbc = eli5.explain_weights_df(perm_gbc, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

perm_gbc = PermutationImportance(gbc_clf).fit(X_test[correct_gbc[:]], y_test[correct_gbc[:]])
print('GBC Correct Results')
exp_gbc_corr = eli5.explain_weights_df(perm_gbc, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

perm_gbc = PermutationImportance(gbc_clf).fit(X_test[misclass_gbc[:]], y_test[misclass_gbc[:]])
print('GBC Misclass Results')
exp_gbc_mis = eli5.explain_weights_df(perm_gbc, feature_names = [0,1,2,3,4,5,6,7,8,9,10])

misclass_ada = np.where(y_pred_ada!=y_test)
misclass_ada = misclass_ada[0].tolist()
print(misclass_ada)

correct_ada = np.where(y_pred_ada==y_test)
correct_ada = correct_ada[0].tolist()
Example #25
# importance in decreasing order
imp_ord = np.argsort(perm.feature_importances_)

plt.figure(figsize=(12, 20))
yaxis = np.arange(len(perm.feature_importances_)) * 1.2
plt.barh(y=yaxis, width=perm.feature_importances_[imp_ord])
plt.yticks(yaxis, feature_names[imp_ord])
plt.ylabel('Feature')
plt.xlabel('Importance')
plt.show()

# ## Select the top 100 important terms to run the model

# In[15]:

WeightDF = eli5.explain_weights_df(perm, feature_names=feature_names)
WeightDF.head(100)

# In[16]:

imp_100 = WeightDF["feature"][0:100].tolist()
impX_100_DF = tfidfDF[imp_100].copy()
impX_100_DF_Testing = tfidfTestingDF[imp_100].copy()
impX_100_DF.head()

# #### 1.2 MLP-NN Model with the top 100 most important variables (terms)
#
#

# In[17]:
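# The original cell is not included in this excerpt. A minimal sketch of the step
# the heading describes, assuming the training/testing labels are available as
# y_train / y_test (names not shown in this snippet):
from sklearn.neural_network import MLPClassifier

mlp_100 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=1)
mlp_100.fit(impX_100_DF, y_train)
print(mlp_100.score(impX_100_DF_Testing, y_test))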
Example #26
def make_eli5_interpretation(training_set, target, model, features, X,
                             ml_name):
    """to display most important features via permutation in eli5
    and sklearn formats"""
    # Permutation importances by eli5
    perm = PermutationImportance(model, n_iter=1,
                                 random_state=0).fit(training_set, target)
    df_explain = explain_weights_df(perm, feature_names=features,
                                    top=10).round(3)
    bar = (alt.Chart(df_explain,
                     title=f'ELI5 Weights Explained from {ml_name}').mark_bar(
                         color="red", opacity=0.6,
                         size=14).encode(x="weight",
                                         y=alt.Y("feature", sort="-x"),
                                         tooltip=["weight"
                                                  ]).properties(height=300,
                                                                width=675))
    st.markdown("#### ELI5 Weights Explained")
    info_global = st.button("How it is calculated")
    if info_global:
        st.info("""
            Each feature importance is obtained via permutation importance:
            each feature is randomly shuffled in turn, and the drop in model
            performance shows how much the model relies on that feature.

            The eli5 plot is only displaying the top 10 features.

            For more information, check out this free course at kaggle:
            [Link](https://www.kaggle.com/dansbecker/permutation-importance)

            To check out the eli5 documentation, click the link:
            [ELI5 Documentation](
                https://eli5.readthedocs.io/en/latest/overview.html
                )
            """)
    st.write(bar)

    st.markdown("#### Permutation Importances")
    info_local = st.button("Information")
    if info_local:
        st.info("""
            The sklearn plot displays all the features in the dataset,
            ordered from the least important to the most important.

            For more information, check out this free course at kaggle:
            [Link](https://www.kaggle.com/dansbecker/permutation-importance)

            To check out the sklearn documentation, click the link:
            [Sklearn Documentation](
                https://scikit-learn.org/stable/modules/permutation_importance.html
                )
            """)
    # Permutation importances by sklearn
    imp = permutation_importance(model, training_set, target, random_state=0)

    data = {
        'importances_mean': imp['importances_mean'],
        'importances_std': imp['importances_std']
    }
    imp = pd.DataFrame(data, index=X.columns)
    imp.sort_values('importances_mean', ascending=False, inplace=True)

    fig, ax = plt.subplots(figsize=(12, 16))
    imp.importances_mean.plot(kind='barh', ax=ax)
    plt.title('Sklearn Permutation Importances',
              fontsize=14,
              fontweight='bold')
    plt.xlabel(ml_name, fontsize=12)

    plt.tight_layout()
    st.write(fig)
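# A minimal, self-contained sketch (not part of the app above) of the idea the
# info box describes: shuffle one column at a time and measure how much the
# model's score drops. Assumes a fitted sklearn-style estimator and a pandas
# DataFrame of features.
import numpy as np

def manual_permutation_importance(model, X, y, n_repeats=5, random_state=0):
    rng = np.random.RandomState(random_state)
    baseline = model.score(X, y)
    drops = {}
    for col in X.columns:
        scores = []
        for _ in range(n_repeats):
            X_shuffled = X.copy()
            X_shuffled[col] = rng.permutation(X_shuffled[col].values)
            scores.append(model.score(X_shuffled, y))
        # a larger drop means the model relies more on this feature
        drops[col] = baseline - np.mean(scores)
    return drops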
Example #27
    'Dataset': ['Raw data', 'Dataset 1', 'Dataset 2'],
    'R^2 score': [scoreR, score1, score2],
    'Best params': [None, None, None]
}
pd.DataFrame(lrDict)

# In[ ]:

lr = LinearRegression(n_jobs=njobs, normalize=True)
lr.fit(X_trnR, Y_trnR)

# Extracting weights of features (not normalized)

# In[ ]:

weights = eli5.explain_weights_df(
    lr)  # weights of LinearRegression model for RawData
rank = [int(i[1:]) for i in weights['feature'].values[1:]]
labels = ['BIAS'] + [X_trnR.columns[i] for i in rank]
weights['feature'] = labels
weights

# #### KNeighbors <a name='knn'></a>

# KNeighbors Regressor has more hyperparameters than Linear Regression, so using GridSearchCV to tune them seems to be a good idea.

# In[ ]:

tuned_parameters = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}
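# A minimal sketch (assumed, not from the original notebook) of running the grid
# above; X_trnR / Y_trnR are the raw-data training arrays used for the linear
# model earlier in this example, and the notebook may well use one of the
# processed datasets instead.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

knn_search = GridSearchCV(KNeighborsRegressor(),
                          tuned_parameters,
                          cv=5,
                          scoring='r2',
                          n_jobs=njobs)
knn_search.fit(X_trnR, Y_trnR)
print(knn_search.best_params_, knn_search.best_score_)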
Example #28
plt.ylabel('features', fontsize=20)

# More balanced results - its use is not extremely widespread.



### Permutation feature importance - (or Mean Decrease Accuracy)

# Import special library, designed for interpretation tasks
import eli5
from eli5.sklearn import PermutationImportance

#Fit and see permutation importance on our training data
perm_train = PermutationImportance(classifier)
perm_train.fit(X_train, y_train)
eli5.explain_weights_df(perm_train, feature_names=features)

#Fit and see permutation importance on our test data
perm_test = PermutationImportance(classifier)
perm_test.fit(X_test, y_test)
eli5.explain_weights_df(perm_test, feature_names=features)

# For this method, it is not clear on which set it should be applied.
# In both cases, we observe a table where features are ranked according to their importance.
# The output takes the form of a weight (along with a standard deviation).


# Results vary according to the method used. Take the limitations / biases of each method into account.
# Combining them gives a more objective view of the true feature importance, which is a great explanation factor.
# Cross-compare with the deductions made during the data visualisation phase, where each feature's impact on churn was more or less assessed.
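# A minimal sketch (not from the original notebook) of one way to combine the two
# views above: merge the train and test permutation importances on the feature
# name so disagreements between the two sets become visible.
train_imp = eli5.explain_weights_df(perm_train, feature_names=features)
test_imp = eli5.explain_weights_df(perm_test, feature_names=features)
comparison = (train_imp[['feature', 'weight']]
              .merge(test_imp[['feature', 'weight']],
                     on='feature',
                     suffixes=('_train', '_test'))
              .sort_values('weight_test', ascending=False))
print(comparison.head(10))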
Example #29
print(mapk(actual, pred))

# -----------------------------------------------------------------------------
# MODEL INTERPRETABILITY

import seaborn as sns
import matplotlib.pyplot as plt
import lime
import shap

from eli5.sklearn import PermutationImportance
from eli5 import explain_weights_df, explain_prediction_df
from lime.lime_tabular import LimeTabularExplainer

# eli5
feat_imp_df = explain_weights_df(model, feature_names=all_cols)
feat_imp_df.head(10)

X_train = train.values
exp_pred_df = explain_prediction_df(estimator=model,
                                    doc=X_train[0],
                                    feature_names=all_cols)

# lime
explainer = LimeTabularExplainer(X_train,
                                 mode='regression',
                                 feature_names=all_cols,
                                 categorical_features=cat_cols,
                                 random_state=1981,
                                 discretize_continuous=True)
exp = explainer.explain_instance(X_valid[10], model.predict, num_features=20)
Example #30
def Run_RF(allLabels, allFeatures, valLabels, valFeatures, outputPrefix,
           repetitions, group, header, outputDir, version, data_id, gini,
           perm):
    #
    defaultFI = [0] * len(valFeatures[0])
    oob = []
    acc = []
    o = [0] * len(valLabels)
    ##
    allLabels_pridf = DataFrame(allLabels)
    allLabels_pridf.columns = ["label"]
    allLabels_df = allLabels_pridf['label'].apply(str)
    valLabels_pridf = DataFrame(valLabels)
    valLabels_pridf.columns = ["label"]
    valLabels_df = valLabels_pridf['label'].apply(str)
    ##
    allFeatures_df = DataFrame(allFeatures, dtype=float)
    valFeatures_df = DataFrame(valFeatures, dtype=float)
    allFeatures_df.columns = header
    valFeatures_df.columns = header
    valFeatures_arr = np.array(valFeatures_df, dtype="float32")
    ##
    for rep in range(0, repetitions):
        print("@@@@       Repetition " + str(rep) + " is started " +
              time.asctime(time.localtime(time.time())) + "@@@@      ")
        # print("@@@@       Start fitting model. " + time.asctime(time.localtime(time.time())) + " @@@@      ")
        rfc = RandomForestClassifier(n_estimators=100, oob_score=True)
        rfc.fit(allFeatures_df, allLabels_df)
        print("@@@@       Fitting model is done! " +
              time.asctime(time.localtime(time.time())) + "       @@@@")
        print(rfc.oob_score_)
        oob.append(rfc.oob_score_)
        predictions = rfc.predict(valFeatures)
        #
        i = 0
        correct = 0
        incorrect = 0
        for x in predictions:
            if x == valLabels[i]:
                correct += 1
            else:
                incorrect += 1
            i += 1
        #
        M = rfc.predict_proba(valFeatures)
        acc.append(correct / float(correct + incorrect))
        #
        k = 0
        for entry in M:
            o[k] += entry[1]
            k += 1
        ## default ##
        defaultFI = [
            x + y for x, y in zip(defaultFI, rfc.feature_importances_)
        ]
        # perm
        if perm == "perm":
            print("@@@@       perm FI is started. " +
                  time.asctime(time.localtime(time.time())) + "       @@@@")
            # # rfpimp
            # rfppermFIOutput=outputDir+"/FI_rfpperm_" + str(rep) + "_" + outputPrefix+"_"+str(group)+"."+version+".txt"
            # output3=open(rfppermFIOutput, "w")
            # rfppermFI=[0]*len(valFeatures[0])
            # imp = permutation_importances(rfc, valFeatures_df, valLabels_df, oob_classifier_accuracy)
            # imp_neworder = imp.loc[header]
            # rfppermFI=[x + y for x, y in zip(rfppermFI, imp_neworder["Importance"])]
            # if perm == "perm":
            # 	m=0
            # 	for element in rfppermFI:
            # 		 output3.write(header[m] + "\t" + str(element) + "\n")
            # 		 m+=1
            #
            ## eli5 ##
            eli5permFIOutput = outputDir + "/FI_perm_rep" + str(
                rep) + "_" + outputPrefix + "_" + str(
                    group) + "." + version + ".txt"
            permFI = PermutationImportance(rfc, random_state=1).fit(
                valFeatures_df, valLabels_df)
            eli5_permFI_df = eli5.explain_weights_df(
                permFI, feature_names=valFeatures_df.columns.tolist())
            eli5_permFI_df.to_csv(eli5permFIOutput, sep='\t', index=False)
            print("@@@@       perm FI is done! " +
                  time.asctime(time.localtime(time.time())) + "       @@@@")
        # gini
        if gini == "gini":
            print("@@@@       gini FI is started. " +
                  time.asctime(time.localtime(time.time())) + "       @@@@")
            valginifiOutput = outputDir + "/FI_gini_rep" + str(
                rep) + "_" + outputPrefix + "_" + str(
                    group) + "." + version + ".txt"
            output4 = open(valginifiOutput, "w")
            ## valginiFI ##
            ## pos sample and neg sample ##
            pos_row = []
            neg_row = []
            pos_row = [
                i for i in range(len(valLabels_df)) if valLabels_df[i] == "1"
            ]
            neg_row = [
                i for i in range(len(valLabels_df)) if valLabels_df[i] == "0"
            ]
            # a = calculate_gini_impurity(rfc, 0, pos_row, neg_row, valLabels_df)
            ensemble_importances = dict(zip(header, [0] * len(header)))
            all_tree_importance = []
            pool = multiprocessing.Pool(processes=1)
            for tree_id in range(len(rfc.estimators_)):
                all_tree_importance.append(
                    pool.apply_async(calculate_gini_impurity,
                                     args=(
                                         rfc,
                                         tree_id,
                                         pos_row,
                                         neg_row,
                                         valLabels_df,
                                         valFeatures_arr,
                                         data_id,
                                     )))
            pool.close()
            pool.join()
            for i in range(len(rfc.estimators_)):
                for j in header:
                    ensemble_importances[j] += all_tree_importance[i].get()[j]
            for j in header:
                ensemble_importances[j] /= len(rfc.estimators_)
            ## gini FI ##
            if gini == "gini":
                k = 0
                for element in ensemble_importances:
                    output4.write(element + "\t" +
                                  str(ensemble_importances[element]) + "\n")
                    k += 1
            output4.close()
            print("@@@@       gini FI is done! " +
                  time.asctime(time.localtime(time.time())) + "       @@@@")
        print("@@@@       Repetition " + str(rep) + " is done! " +
              time.asctime(time.localtime(time.time())) + "       @@@@")
        print("\n")
    #
    print(numpy.mean(oob), "\t", numpy.mean(acc))
    #
    valOutput = outputDir + "/Predictions_" + outputPrefix + "_" + str(
        group) + "." + version + ".txt"
    defaultFIOutput = outputDir + "/FI_default_" + outputPrefix + "_" + str(
        group) + "." + version + ".txt"
    output1 = open(valOutput, "w")
    output2 = open(defaultFIOutput, "w")
    ## validation ##
    i = 0
    for entry in o:
        output1.write(
            str(valLabels[i]) + "\t" + str(entry / float(repetitions)) + "\n")
        i += 1
    ## defaultFI ##
    j = 0
    for element in defaultFI:
        output2.write(header[j] + "\t" + str(element / float(repetitions)) +
                      "\n")
        j += 1
    output1.close()
    output2.close()