Пример #1
0
def test_explain_linear_tuple_top(newsgroups_train):
    """A ``top=(n_pos, n_neg)`` tuple limits positive and negative features
    independently for every target.
    """
    docs, y, target_names = newsgroups_train
    vec = TfidfVectorizer()
    X = vec.fit_transform(docs)
    clf = LogisticRegression(random_state=42)
    clf.fit(X, y)

    # Zero positive features requested, ten negative ones.
    res_neg = explain_weights(
        clf, vec=vec, target_names=target_names, top=(0, 10))
    expl_neg, _ = format_as_all(res_neg, clf)
    for target in res_neg.targets:
        weights = target.feature_weights
        assert len(weights.pos) == 0
        assert len(weights.neg) == 10
    # No positive weight should be rendered in the text output either.
    assert "+0." not in expl_neg

    # Ten positive features requested, two negative ones.
    res_pos = explain_weights(
        clf, vec=vec, target_names=target_names, top=(10, 2))
    format_as_all(res_pos, clf)
    for target in res_pos.targets:
        weights = target.feature_weights
        assert len(weights.pos) == 10
        assert len(weights.neg) == 2
Пример #2
0
def test_explain_linear_feature_filter(newsgroups_train, vec):
    """Both ``feature_re`` and ``feature_filter`` restrict which features
    appear in every formatted explanation.
    """
    docs, y, target_names = newsgroups_train
    clf = LogisticRegression(random_state=42)
    clf.fit(vec.fit_transform(docs), y)
    if isinstance(vec, HashingVectorizer):
        # Hashing vectorizers are explained through the invertable wrapper.
        vec = InvertableHashingVectorizer(vec)
        vec.fit(docs)

    # Regex filter: only names starting with "ath", so no bias term.
    res = explain_weights(clf, vec=vec, feature_re='^ath')
    expls = format_as_all(res, clf)
    text_expl, _ = expls
    for rendered in expls:
        assert 'atheists' in rendered
        assert 'atheism' in rendered
        assert 'space' not in rendered
        assert 'BIAS' not in rendered

    # Callable filter: same prefix, but the bias term is let through.
    def keep(name):
        return name.startswith('ath') or name == '<BIAS>'

    res = explain_weights(clf, vec=vec, feature_filter=keep)
    expls = format_as_all(res, clf)
    text_expl, _ = expls
    for rendered in expls:
        assert 'atheists' in rendered
        assert 'atheism' in rendered
        assert 'space' not in rendered
        assert 'BIAS' in rendered
    assert '<BIAS>' in text_expl
Пример #3
0
def main(input_path=TRAIN_TEST_DIR, output_path=MODELS_DIR):
    """Cross-validate, fit and export the model together with its metrics.

    Loads the train/test split from ``input_path``, scores the model from
    ``make_model()`` with 10-fold stratified CV (ROC AUC), fits it on the
    training data, prints eli5 weight explanations and writes the model,
    its test-set errors and the CV metrics into ``output_path``.

    Parameters
    ----------
    input_path : path-like
        Directory holding ``X_train/X_test/y_train/y_test`` joblib files.
    output_path : pathlib.Path-like
        Directory that receives ``model.joblib``, ``errors.joblib``,
        ``errors_proba.joblib`` and ``metrics.yaml``; created if missing.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    X_train, X_test, y_train, y_test = ujob.load_multiple(
        input_path,
        ['X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib'])

    model = make_model()

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

    # Evaluate the model via cross-validation
    cv_scores = cross_val_score(model,
                                X_train,
                                y_train,
                                cv=cv,
                                scoring='roc_auc')

    # Store and print evaluation metrics (plain floats so YAML stays readable)
    metrics = {
        'cv_scores': {
            f'fold{i}': float(value)
            for i, value in enumerate(cv_scores.tolist(), start=1)
        },
        'score_mean': float(np.mean(cv_scores)),
        'score_std': float(np.std(cv_scores)),
    }

    print(f'CV scores: {list(metrics["cv_scores"].values())}')
    print(f'Mean CV score: {metrics["score_mean"]}')
    print(f'Std CV score: {metrics["score_std"]}')

    # Fit the model on the whole dataset
    model.fit(X_train, y_train)

    # Show feature importances: site-vectorizer features first, then the rest
    formatter = eli5.formatters.text.format_as_text
    print(
        formatter(
            eli5.explain_weights(model,
                                 feature_filter=lambda x: x.split('__')[0] ==
                                 'site_vectorizer')))
    print(
        formatter(eli5.explain_weights(
            model,
            feature_filter=lambda x: x.split('__')[0] != 'site_vectorizer'),
                  show=['targets']))

    # Calculate model errors: rows of (test index, predicted, actual)
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    errors_mask = (y_test_pred != y_test)
    errors = np.array([np.arange(X_test.shape[0]), y_test_pred,
                       y_test]).transpose()[errors_mask]
    errors_proba = y_test_pred_proba[errors_mask]

    # Export metrics and errors
    ujob.dump_multiple(
        [model, errors, errors_proba], output_path,
        ['model.joblib', 'errors.joblib', 'errors_proba.joblib'])
    # Write next to the other artifacts: the original always used MODELS_DIR,
    # ignoring a caller-supplied output_path.
    with open(output_path.joinpath('metrics.yaml'), 'w') as fout:
        yaml.dump(metrics, fout, sort_keys=False)
Пример #4
0
def test_explain_pipeline(predictor, transformer, X, feature_names,
                          explain_kwargs):
    """Explaining a fitted pipeline renders the same HTML as explaining the
    bare predictor fitted on the two-feature reference data.
    """
    y = [1, 0]

    # Reference: the predictor alone with explicit feature names.
    bare = clone(predictor).fit([[1, 0], [0, 1]], y)
    expected = explain_weights(bare,
                               feature_names=['hello', 'world'],
                               **explain_kwargs)

    # Same predictor behind the transformer under test.
    pipe = make_pipeline(transformer, clone(predictor)).fit(X, y)
    actual = explain_weights(pipe,
                             feature_names=feature_names,
                             **explain_kwargs)

    expected_html = expected._repr_html_()
    actual_html = actual._repr_html_()
    assert expected_html == actual_html
Пример #5
0
def test_unsupported():
    """An unsupported estimator yields an error explanation, while an
    unknown keyword argument raises ``TypeError``.
    """
    clf = BaseEstimator()
    vec = CountVectorizer()

    res = explain_weights(clf, vec=vec)
    assert 'BaseEstimator' in res.error
    for rendered in format_as_all(res, clf):
        for fragment in ('Error', 'BaseEstimator'):
            assert fragment in rendered

    with pytest.raises(TypeError):
        explain_weights(clf, unknown_argument=True)
Пример #6
0
def test_explain_decision_tree_regressor_multitarget():
    """A multi-target decision tree regressor is explained with flat
    (not nested) leaf values, and the explanation is reproducible.
    """
    X, y = make_regression(
        n_samples=100, n_targets=3, n_features=10, random_state=42)
    reg = DecisionTreeRegressor(random_state=42, max_depth=3)
    reg.fit(X, y)

    res = explain_weights(reg)
    expl_text, expl_html = format_as_all(res, reg)

    assert 'x9' in expl_text
    # Leaf values are printed as one list, never as a list of lists.
    assert '---> [' in expl_text
    assert '---> [[' not in expl_text

    # Explaining the same estimator again gives an equal result.
    again = explain_weights(reg)
    assert res == again
Пример #7
0
def test_explain_linear_regression_multitarget(reg):
    """A multi-target linear regressor explanation lists feature ``x9`` and
    the bias term, overall and for target ``y2``, and is reproducible.
    """
    X, y = make_regression(
        n_samples=100, n_targets=3, n_features=10, random_state=42)
    reg.fit(X, y)

    res = explain_weights(reg)
    expl, _ = format_as_all(res, reg)
    assert 'x9' in expl
    assert '<BIAS>' in expl

    # For target y2 each name must be on the positive or the negative side.
    pos, neg = top_pos_neg(res, 'y2')
    assert 'x9' in neg or 'x9' in pos
    assert '<BIAS>' in neg or '<BIAS>' in pos

    again = explain_weights(reg)
    assert res == again
Пример #8
0
def test_explain_lightgbm_booster(boston_train):
    """A raw LightGBM booster is explained with ``Column_N`` names unless
    ``feature_names`` is passed explicitly.
    """
    xs, ys, feature_names = boston_train
    train_params = {'objective': 'regression', 'verbose_eval': -1}
    booster = lightgbm.train(
        params=train_params,
        train_set=lightgbm.Dataset(xs, label=ys),
    )

    # Without names the booster's own column labels show up.
    default_res = explain_weights(booster)
    for rendered in format_as_all(default_res, booster):
        assert 'Column_12' in rendered

    # Explicit names replace them.
    named_res = explain_weights(booster, feature_names=feature_names)
    for rendered in format_as_all(named_res, booster):
        assert 'LSTAT' in rendered
Пример #9
0
def print_eli5(click_data, category):
    """Build a table explaining the model prediction for a clicked grid cell.

    Parameters
    ----------
    click_data : dict
        Click payload; ``click_data['points'][0]`` must carry ``'lon'`` and
        ``'lat'``, which are matched against ``grid_x`` / ``grid_y``.
    category : str
        Prefix of the ``<category>_xy.csv`` prediction file and of the
        ``<category>.h5`` model file loaded with joblib.

    Returns
    -------
    Whatever ``generate_table`` returns for the explanation frame.
    """
    pred = pd.read_csv(category + '_xy.csv')
    model = joblib.load(category + ".h5")
    point = click_data['points'][0]
    pred = pred.loc[(pred['grid_x'] == point['lon']) &
                    (pred['grid_y'] == point['lat']), :]
    pred_sqr = pred['eurogrid_0250_1'].values[0]
    # `df` is a module-level frame defined outside this function.
    dane_model = df.loc[df['eurogrid_0250_1'] == pred_sqr, :]
    dict_ = eli5.format_as_dataframe(eli5.explain_weights(model))
    cols = dict_['feature'].values
    # eli5 reports anonymous features as x0, x1, ...; map them back to names.
    mapping = {'x' + str(i): col for i, col in enumerate(cols)}

    expl = dane_model.loc[:, cols]
    # Try column orderings until one is accepted by explain_prediction.
    # NOTE(review): this is factorial in the number of columns.
    for ordering in itertools.permutations(cols):
        try:
            expl = expl.loc[:, list(ordering)]
            expl = eli5.formatters.format_as_dataframe(
                eli5.explain_prediction(model, expl))
            break
        except Exception:
            # Narrowed from a bare ``except`` so Ctrl-C / SystemExit still
            # interrupt the search.
            continue
    expl['feature'] = expl['feature'].apply(lambda x: map_x(x, mapping))
    return generate_table(expl)
Пример #10
0
def main(input_path=TRAIN_TEST_DIR, output_path=MODELS_DIR):
    """Cross-validate the model (5-fold, accuracy), fit it on the training
    data, print its eli5 weight explanation and dump the model together with
    its test-set errors into ``output_path``.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    split_files = [
        'X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib'
    ]
    X_train, X_test, y_train, y_test = ujob.load_multiple(
        input_path, split_files)

    model = make_model()

    # Cross-validated accuracy on the training split.
    cv_scores = cross_val_score(model,
                                X_train,
                                y_train,
                                cv=StratifiedKFold(n_splits=5),
                                scoring='accuracy')
    print(f'CV scores: {cv_scores}')
    print(f'Mean CV score: {np.mean(cv_scores)}')

    model.fit(X_train, y_train)

    explanation = eli5.explain_weights(model)
    print(eli5.formatters.text.format_as_text(explanation))

    # Misclassified test rows as (test index, predicted, actual).
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    errors_mask = (y_test_pred != y_test)
    row_ids = np.arange(X_test.shape[0])
    errors = np.array([row_ids, y_test_pred, y_test]).transpose()[errors_mask]
    errors_proba = y_test_pred_proba[errors_mask]

    artifacts = [model, errors, errors_proba]
    artifact_names = ['model.joblib', 'errors.joblib', 'errors_proba.joblib']
    ujob.dump_multiple(artifacts, output_path, artifact_names)
Пример #11
0
def explore_final_model():
    """Print label statistics and eli5 explanations for the pickled model.

    Reads the module-level ``y``, ``X``, ``filename_model`` and
    ``test_filename``; prints the label distribution, the model's top-10
    weights and an explanation of the first test instance.
    """
    # https://github.com/gameofdimension/xgboost_explainer/blob/master/xgboost_explainer_demo.ipynb

    nr_labels = len(y)
    value_counts = y.value_counts()
    perc_per_label = {
        k: round(100 * v / float(nr_labels), 2)
        for k, v in value_counts.items()
    }
    print('value counts:', value_counts)
    print('perc per label:', perc_per_label)

    # NOTE(review): pickle executes arbitrary code on load - only use with
    # model files from a trusted source.
    with open(filename_model, "rb") as model_file:
        model = pickle.load(model_file)
    model_feature_names = model.attr('feature_names').split('|')
    index_to_class = json.loads(model.attr('index_to_class'))
    print(index_to_class)
    classes = [index_to_class[k] for k in sorted(index_to_class.keys())]
    print(classes)

    print('eli5 explain weights (gain):\n',
          eli5.format_as_text(eli5.explain_weights(model, top=10)))  # gain

    # Only the first five test rows are used for the prediction demo.
    with open(test_filename, "r") as test_file:
        df_test = pd.read_json(test_file)
    df_test = df_test.head(5)
    feature_extractor = FeatureExtractor(df_test)
    X_test, X_test_featurenames = feature_extractor.get_features_pred_instances(
        df_test, model_feature_names)

    print(X)
    print(set(X.dtypes))
    print(
        eli5.format_as_text(
            eli5.explain_prediction(model,
                                    X_test.head(1),
                                    target_names=classes,
                                    top=10,
                                    feature_names=X_test_featurenames)))
Пример #12
0
def test_sklearn_crfsuite(xseq, yseq):
    """A fitted CRF is explained with per-label top features and a transition
    table in text, HTML and (when importable) dataframe form.
    """
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    expl = explain_weights(crf)
    text, html = format_as_all(expl, crf)

    # Text output: per-label sections, transition block and exact numbers.
    assert "y='sunny' top features" in text
    assert "y='rainy' top features" in text
    assert "Transition features" in text
    assert "sunny   -0.130    0.696" in text
    assert u'+0.124  солнце:не светит' in text

    # HTML output: non-ASCII feature kept, transition header in label order.
    html_nospaces = html.replace(' ', '').replace("\n", '')
    assert u'солнце:не светит' in html
    assert '<th>rainy</th><th>sunny</th>' in html_nospaces

    # The dataframe checks run only when the dataframe formatters import.
    try:
        from eli5 import format_as_dataframe, format_as_dataframes
    except ImportError:
        pass
    else:
        from .test_formatters_as_dataframe import check_targets_dataframe
        df_dict = format_as_dataframes(expl)
        check_targets_dataframe(df_dict['targets'], expl)
        df_transition = df_dict['transition_features']
        transition = expl.transition_features
        print(df_transition)
        assert list(transition.class_names) == ['rainy', 'sunny']
        # (column, row) of the frame against (i, j) of the coefficient matrix.
        cells = [
            ('rainy', 'rainy', 0, 0),
            ('sunny', 'rainy', 0, 1),
            ('rainy', 'sunny', 1, 0),
        ]
        for column, row, i, j in cells:
            assert np.isclose(df_transition[column][row],
                              transition.coef[i, j])
Пример #13
0
def explain(model_path):
    """Print the model's weights, then explain predictions for stdin rows.

    Loads the word-segmenter, word2vec, vocabulary, sentence model and
    NER/synonym dictionaries from fixed relative paths, loads the model from
    ``model_path`` with joblib and prints its eli5 weight explanation. Each
    tab-separated ``qid / ql / qr`` row read from stdin is then turned into
    features, scored, written to stdout and followed by an eli5 prediction
    explanation and the feature row itself.

    NOTE(review): this block uses Python 2 ``print`` statements.
    """
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"

    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path,
                                                      binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()

    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)

    pd.set_option('display.max_rows', None)

    # NOTE(review): the local name `explain` shadows this function's own name.
    explain = eli5.explain_weights(model, top=None)
    explain = eli5.format_as_text(explain)
    print explain

    # Overwritten inside the loop below before it is ever read.
    feature_names = []
    column_names = ["qid", "ql", "qr"]
    #reader = pd.read_csv(in_path, sep="\t", dtype="str", names=column_names, chunksize=100)
    # chunksize=1: every chunk yielded by the reader holds a single input row.
    reader = pd.read_csv(sys.stdin,
                         sep="\t",
                         dtype="str",
                         names=column_names,
                         chunksize=1)
    # Only referenced by the commented-out file-output code further down.
    first_chunk = True
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model, ner_dict, syn_dict)
    for data in reader:
        # In-place fill; the returned value bound to `_` is not used.
        _ = data.fillna("", inplace=True)

        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        # Expand the per-row extractor results into one column per feature.
        X_features = X.apply(pd.Series)
        feature_names = X_features.columns.values.tolist()
        X_features = X_features[feature_names]
        y_preds = model.predict_proba(X_features,
                                      ntree_limit=model.best_ntree_limit)
        # Keep column 1 of each probability row
        # (presumably the positive class - TODO confirm).
        y_preds = map(lambda o: o[1], y_preds)
        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)

        #if first_chunk:
        #    data.to_csv(in_path + ".predict", header=True, sep="\t", mode="w")
        #    first_chunk = False
        #else:
        #    data.to_csv(in_path + ".predict", header=False, sep="\t", mode="a")
        data.to_csv(sys.stdout, header=False, sep="\t")
        # Explain the first (and, with chunksize=1, only) row of the chunk.
        explain = eli5.explain_prediction(model, X_features.iloc[0])
        explain = eli5.format_as_text(explain)
        print explain
        print X_features.iloc[0]
Пример #14
0
def visualise_feature_importance(model, title_variable, x_test, y_test):
    """Print permutation importances and plot the model's own importances.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    title_variable
        Value inserted into the plot title and the output file name
        ``plots/fi<title_variable>.png``.
    x_test, y_test
        Data the permutation importance is fitted on.
    """
    print(
        eli5.format_as_text(
            eli5.explain_weights(
                PermutationImportance(model,
                                      random_state=42).fit(x_test, y_test))))

    importances = model.feature_importances_
    # One label per feature (X1, X2, ...); the original hard-coded exactly
    # seven labels regardless of how many features the model has.
    labels = [
        "X{} - {:4.1f}%".format(i, importance * 100)
        for i, importance in enumerate(importances, start=1)
    ]

    patches, texts = plt.pie(importances,
                             wedgeprops=dict(width=0.5),
                             startangle=90,
                             radius=1.2)

    plt.legend(patches,
               labels,
               prop={'size': 12},
               bbox_to_anchor=(0.74, 0.5),
               loc="center right",
               fontsize=8)
    plt.title("Feature Importance for {}".format(title_variable))
    plt.savefig("plots/fi{}.png".format(title_variable))
    plt.show()
Пример #15
0
    def _train_model(self, df_train):
        # type: (List[List[Tuple[Text, Text, Text, Text]]]) -> None
        """Train the crf tagger based on the training data.

        Fits ``self.ent_tagger`` on the features and labels derived from
        ``df_train`` and then prints debugging information about the learned
        weights of the second explained target.
        """
        import sklearn_crfsuite

        X_train = [self._sentence_to_features(sent) for sent in df_train]
        y_train = [self._sentence_to_labels(sent) for sent in df_train]
        self.ent_tagger = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                # coefficient for L1 penalty
                c1=self.component_config["L1_c"],
                # coefficient for L2 penalty
                c2=self.component_config["L2_c"],
                # stop earlier
                max_iterations=self.component_config["max_iterations"],
                # include transitions that are possible, but not observed
                all_possible_transitions=True
        )
        self.ent_tagger.fit(X_train, y_train)

        # Debug output for the second target of the weight explanation.
        result = eli5.explain_weights(self.ent_tagger, top=10, target_names='name')
        target = result.targets[1]
        target_repr = str(target)
        print(len(target_repr))
        print(target_repr)

        # The original looped over the *characters* of the repr, so its
        # '"feature" in ch' test could never match and the list stayed empty.
        # Read the (feature, weight) pairs from the structured explanation.
        weights = target.feature_weights
        weight_dict = [
            (fw.feature, fw.weight)
            for fw in list(weights.pos) + list(weights.neg)
        ]

        print(weight_dict)
def explainELI5(rows, name, classes, XY):
    """Write eli5 HTML reports for a stored classifier.

    Loads the model ``<name>-<classes>``, explains its weights and its
    predictions for the first two samples in ``XY[0]`` using the feature
    names in ``XY[2]``, and writes each explanation to its own HTML file
    below ``ExplainPath + name + "_" + classes``. ``rows`` is not used.
    """
    import eli5
    import eli5.sklearn
    from eli5 import show_weights
    from eli5.sklearn import PermutationImportance
    from eli5.sklearn import explain_prediction_linear_classifier
    from eli5 import show_prediction

    cols = XY[2]
    samples = XY[0]
    # Enough slots for every feature plus one extra entry.
    n_top = len(cols) + 1

    clf = loadModel(name+"-"+classes)

    ExplainPath_Specific = ExplainPath + name + "_" + classes

    weights = eli5.explain_weights(clf, feature_names=cols, top=n_top)
    predictionA = eli5.explain_prediction(clf, samples[0], feature_names=cols, top=n_top)
    predictionB = eli5.explain_prediction(clf, samples[1], feature_names=cols, top=n_top)

    reports = (
        ("_ELI5_WEIGHTS.html", weights),
        ("_ELI5_PREDICTION_A.html", predictionA),
        ("_ELI5_PREDICTION_B.html", predictionB),
    )
    for suffix, explanation in reports:
        html = eli5.format_as_html(explanation)
        # The with-block closes the file; no explicit close() is needed.
        with open(uniquify(ExplainPath_Specific + "/" + classes + suffix), 'w') as f:
            f.write(html)
Пример #17
0
def get_feature_importances(model, data_dict, trainvars, data="even"):
    """Returns the feature importance relevant for neural network case using
    the eli5 package. Note: calculating the feature importances takes a while
    due to it calculating all the permutations.

    Parameters:
    ----------
    model:The nn_model
        The NN model created by create_nn_model
    data_dict : dict
        Contains all the necessary information for the evaluation. The entry
        ``data_dict[data + "_data"]`` must provide the ``trainvars`` columns
        plus ``'multitarget'`` and ``'totalWeight'``.
    trainvars : list
        Names of the training variables; used both to select the input
        columns and as the feature names of the explanation.
    data : str
        Prefix selecting which ``<data>_data`` entry of ``data_dict`` is used
        (default ``"even"``).

    Returns:
    -------
    feature_importances : dict
        The feature importances equivalent for nn using the eli5 package.
    """
    dataset = data_dict[data + "_data"]
    # NOTE(review): if `mt` is sklearn.metrics, roc_curve returns curve
    # arrays rather than a scalar score - confirm this scoring is intended.
    perm = PermutationImportance(model, scoring=mt.roc_curve).fit(
        dataset[trainvars].values,
        dataset['multitarget'],
        sample_weight=dataset['totalWeight']
    )
    # The feature names are the training variables themselves; the original
    # indexed data_dict with the list, which is not a valid dict key.
    weights = eli5.explain_weights(perm, feature_names=trainvars)
    weights_df = format_as_dataframe(weights).sort_values(
        by='weight', ascending=False).rename(columns={'weight': 'score'})
    feature_importances = {
        record['feature']: record['score']
        for record in weights_df.to_dict('records')
    }
    return feature_importances
Пример #18
0
def get_feature_importances(model, data_dict):
    '''Compute permutation feature importances for the model with eli5.

    Fitting the permutation importance can take a while.

    Parameters:
    ----------
    model:The nn_model
        The NN model created by create_nn_model
    data_dict : dict
        Must provide ``'train'``, ``'training_labels'`` and ``'trainvars'``.

    Returns:
    -------
    feature_importances : dict
        Maps each feature name to its importance score, inserted in
        descending score order.
    '''
    perm = PermutationImportance(model)
    perm = perm.fit(data_dict['train'], data_dict['training_labels'])
    weights = eli5.explain_weights(perm, feature_names=data_dict['trainvars'])

    weights_df = format_as_dataframe(weights)
    weights_df = weights_df.sort_values(by='weight', ascending=False)
    weights_df = weights_df.rename(columns={'weight': 'score'})

    return {
        record['feature']: record['score']
        for record in weights_df.to_dict('records')
    }
Пример #19
0
def performance_measurement(crf_model, x, y, g_sentences):
    """Utilizes different functions to measure the model's performance and saves the results to files for review.

    Runs 5-fold cross-validated prediction, writes the classification
    report, the raw predictions and the top-100 eli5 weights to a dated file
    under ``results/`` (also echoed through ``print2both``), then hands the
    predictions to ``categorize_predictions``.

    Parameters:
        crf_model: estimator passed to ``cross_val_predict`` and eli5.
        x: input sequences.
        y: gold label sequences.
        g_sentences: gold sentences forwarded to ``categorize_predictions``.
    """
    # Cross-validating the model
    cross_val_predictions = cross_val_predict(estimator=crf_model,
                                              X=x,
                                              y=y,
                                              cv=5)
    report = flat_classification_report(y_pred=cross_val_predictions, y_true=y)
    today = datetime.datetime.today().date()
    # 'w' replaces the original append + seek(0) + truncate() sequence, which
    # also emptied the file; the with-block closes it even if a write fails.
    with open(f'results/performance_measurement_results_{today}.txt',
              'w',
              encoding='utf-8') as file:
        print2both('created on:', str(today), '\n', file=file)
        print2both('flat_classification_report:\n\n',
                   report,
                   '\n\n',
                   file=file)
        print2both('cross_val_predict:\n\n',
                   cross_val_predictions,
                   '\n\n',
                   file=file)
        # Showing the weights assigned to each feature
        print2both('eli5.explain_weights(crf, top=100):\n\n',
                   eli5.format_as_text(
                       eli5.explain_weights(crf_model, top=100)),
                   '\n\n',
                   file=file)
    # Saving the potentially correct and the incorrect classifications in separate CSV files for review
    categorize_predictions(gold_sents=g_sentences,
                           y_hat=cross_val_predictions,
                           y_actual=y)
Пример #20
0
def test_explain_weights_feature_names_pandas(boston_train):
    """Feature names come from DataFrame columns unless overridden."""
    pd = pytest.importorskip('pandas')
    X, y, feature_names = boston_train
    frame = pd.DataFrame(X, columns=feature_names)
    reg = LGBMRegressor().fit(frame, y)

    # it should pick up feature names from DataFrame columns
    res = explain_weights(reg)
    for rendered in format_as_all(res, reg):
        assert 'PTRATIO' in rendered

    # it is possible to override DataFrame feature names
    numeric_feature_names = [
        "zz%s" % idx for idx, _ in enumerate(feature_names)
    ]
    res = explain_weights(reg, feature_names=numeric_feature_names)
    for rendered in format_as_all(res, reg):
        assert 'zz12' in rendered
Пример #21
0
def weight_explainer(estimator,
                     num_features,
                     cat_features=None,
                     remainders=None):
    """
    Explain the weights of the final estimator of a fitted pipeline.

    input:  a pipeline object with an 'estimator' step and, when
            ``cat_features`` is given, a 'transformer' step whose
            'categorical' transformer has an 'encoder' step.
            ``num_features`` are the numeric feature names; ``remainders``
            are extra names appended only when ``cat_features`` is None.
    output: feature coefficients/weights (eli5 explanation, top 20)
    """
    import eli5

    # Work on a copy: extending the caller's list in place made repeated
    # calls accumulate duplicate names.
    feature_names = list(num_features)

    # `is not None` also works for arrays, where `!= None` is element-wise.
    if cat_features is not None:
        encoder = estimator.named_steps['transformer']\
            .named_transformers_['categorical']\
            .named_steps['encoder']
        feature_names.extend(encoder.get_feature_names())
    elif remainders is not None:
        feature_names.extend(remainders)

    return eli5.explain_weights(estimator.named_steps['estimator'],
                                top=20,
                                feature_names=feature_names)
Пример #22
0
def explain_weights_df(estimator, **kwargs):
    # type: (...) -> pd.DataFrame
    """ Return the weight explanation of ``estimator`` as a
    ``pandas.DataFrame``.
    Keyword arguments are completed by ``_set_defaults`` and then forwarded
    to :func:`eli5.explain_weights`.
    """
    explanation = eli5.explain_weights(estimator, **_set_defaults(kwargs))
    return format_as_dataframe(explanation)
Пример #23
0
def process_xgb():
    """Train XGBoost over 20 random splits, average the test predictions
    and write them to ``subxgb.csv``.

    Targets are fitted in ``log1p`` space and predictions mapped back with
    ``expm1``. Rows with a positive target in ``test_ref`` overwrite the
    averaged prediction before the file is written.
    """
    col, train, test, test_ref = load_data()
    print(train.shape, test.shape, test_ref.shape)

    params = dict(
        colsample_bytree=0.055,
        colsample_bylevel=0.4,
        gamma=1.5,
        learning_rate=0.01,
        max_depth=5,
        objective='reg:linear',
        booster='gbtree',
        min_child_weight=10,
        n_estimators=1800,
        reg_alpha=0,
        reg_lambda=0,
        eval_metric='rmse',
        subsample=0.7,
        silent=True,
        seed=7,
    )
    folds = 20
    full_score = 0.0
    xg_test = xgb.DMatrix(test[col])
    # Switch between the sklearn wrapper and the native training API.
    use_regressor = False
    for fold in range(folds):
        x_fit, x_val, y_fit, y_val = model_selection.train_test_split(
            train[col],
            np.log1p(train.target.values),
            test_size=0.0010,
            random_state=fold)

        if not use_regressor:
            xg_valid = xgb.DMatrix(x_val, label=y_val)
            xg_train = xgb.DMatrix(x_fit, label=y_fit)
            model = xgb.train(params, xg_train, params['n_estimators'])
            score = np.sqrt(mean_squared_error(y_val, model.predict(xg_valid)))
            test['target'] += np.expm1(model.predict(xg_test))
        else:
            model = xgb.XGBRegressor(
                colsample_bytree=params['colsample_bytree'],
                colsample_bylevel=params['colsample_bylevel'],
                gamma=params['gamma'],
                learning_rate=params['learning_rate'],
                max_depth=params['max_depth'],
                objective=params['objective'],
                booster=params['booster'],
                min_child_weight=params['min_child_weight'],
                n_estimators=params['n_estimators'],
                reg_alpha=params['reg_alpha'],
                reg_lambda=params['reg_lambda'],
                eval_metric=params['eval_metric'],
                subsample=params['subsample'],
                silent=1,
                n_jobs=-1,
                early_stopping_rounds=100,
                random_state=7,
                nthread=-1)
            model.fit(x_fit, y_fit)
            score = np.sqrt(mean_squared_error(y_val, model.predict(x_val)))
            test['target'] += np.expm1(model.predict(test[col]))

        print('Fold', fold, 'Score', score)
        full_score += score

    full_score /= folds
    print('Full score', full_score)

    # Average the accumulated per-fold predictions.
    test['target'] /= folds

    # Rows with a positive reference target take that value instead.
    known = test_ref.target > 0
    test.loc[known, 'target'] = test_ref[known].target.values

    test[['ID', 'target']].to_csv('subxgb.csv', index=False)

    # Flip to True to print the weights of the last trained booster.
    explain = False
    if explain and not use_regressor:
        print(eli5.format_as_text(eli5.explain_weights(model, top=200)))
Пример #24
0
def test_explain_linear_hashed_pos_neg(newsgroups_train, pass_feature_weights):
    """Explanations built from a ``HashingVectorizer`` match those built
    from a ``CountVectorizer`` on the same binary problem.
    """
    docs, y, target_names = newsgroups_train
    # make it binary
    y = y.copy()
    y[y != 0] = 1
    target_names = [target_names[0], 'other']

    vec = HashingVectorizer(norm=None)
    ivec = InvertableHashingVectorizer(vec)
    clf = LogisticRegression(random_state=42)
    clf.fit(vec.fit_transform(docs), y)
    ivec.fit(docs)

    if not pass_feature_weights:
        # Let explain_weights use the invertable vectorizer directly.
        res = explain_weights(clf,
                              ivec,
                              top=(10, 10),
                              target_names=target_names)
    else:
        # Pass names and sign scaling by hand instead of the vectorizer.
        res = explain_weights(
            clf,
            top=(10, 10),
            target_names=target_names,
            feature_names=ivec.get_feature_names(always_signed=False),
            coef_scale=ivec.column_signs_)

    # HashingVectorizer with norm=None is "the same" as CountVectorizer,
    # so we can compare it and check that explanation is almost the same.
    count_vec = CountVectorizer()
    count_clf = LogisticRegression(random_state=42)
    count_clf.fit(count_vec.fit_transform(docs), y)
    count_res = explain_weights(count_clf,
                                vec=count_vec,
                                top=(10, 10),
                                target_names=target_names)

    for key in ['pos', 'neg']:
        values = sorted(
            get_names_coefs(getattr(res.targets[0].feature_weights, key)))
        count_values = sorted(
            get_names_coefs(
                getattr(count_res.targets[0].feature_weights, key)))
        assert len(values) == len(count_values)
        for hashed, counted in zip(values, count_values):
            name, coef = hashed
            count_name, count_coef = counted
            assert name == count_name
            assert abs(coef - count_coef) < 0.05
Пример #25
0
def test_feature_importances_no_remaining(clf):
    """ Check that no "more features" note is rendered in any format for a
    classifier fitted on two-feature data.
    """
    n = 100
    # First column follows the label with a little noise; second is constant.
    X = np.array([[i % 2 + 0.1 * np.random.random(), 0] for i in range(n)])
    y = np.array([i % 2 for i in range(n)])
    clf.fit(X, y)

    res = explain_weights(clf)
    for rendered in format_as_all(res, clf):
        assert not ('more features' in rendered or 'more &hellip;' in rendered)
Пример #26
0
def test_explain_linear_regression_feature_filter(boston_train):
    """A compiled, case-insensitive ``feature_re`` keeps only the matching
    features of a linear regression explanation.
    """
    X, y, feature_names = boston_train
    clf = ElasticNet(random_state=42)
    clf.fit(X, y)

    ratio_suffix = re.compile('ratio$', re.I)
    res = explain_weights(clf,
                          feature_names=feature_names,
                          feature_re=ratio_suffix)
    for rendered in format_as_all(res, clf):
        assert 'PTRATIO' in rendered
        assert 'LSTAT' not in rendered
Пример #27
0
def explain_weights_dfs(estimator, **kwargs):
    # type: (...) -> Dict[str, pd.DataFrame]
    """ Return the weight explanation of ``estimator`` as a dict of
    ``pandas.DataFrame`` values, as produced by
    :func:`eli5.formatters.as_dataframe.format_as_dataframes`.
    Keyword arguments are completed by ``_set_defaults`` and then forwarded
    to :func:`eli5.explain_weights`.
    """
    explanation = eli5.explain_weights(estimator, **_set_defaults(kwargs))
    return format_as_dataframes(explanation)
Пример #28
0
def test_sklearn_crfsuite_feature_re(xseq, yseq):
    """``feature_re`` restricts a CRF explanation to matching features in
    every output format.
    """
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])

    explanation = explain_weights(crf, feature_re=u'(солн|clean)')
    for rendered in format_as_all(explanation, crf):
        assert u'солн' in rendered
        assert u'clean' in rendered
        assert 'walk' not in rendered
Пример #29
0
def interpret_model(model: Pipeline, all_features: list):
    """Log the eli5 weight explanation of the pipeline's ``"model"`` step,
    using ``all_features`` as feature names.
    """
    log.debug("All features: {}".format(all_features))

    # Explain the model
    explanation = eli5.explain_weights(model.named_steps["model"],
                                       feature_names=all_features)
    log.info(eli5.format_as_text(explanation))
Пример #30
0
 def show_feature_importance(self):
     """Print the permutation feature importance of this model, fitted on
     ``self.x_test`` / ``self.y_test``.
     """
     print("\n\n\n\n+++++++++++++++++++++++++++++++")
     print('Calculating feature importance for model ', self.name, "...")
     importance = PermutationImportance(self.get_eli5_model(), random_state=1)
     perm = importance.fit(self.x_test, self.y_test)
     print(self.name, 'model feature importance')
     explanation = eli5.explain_weights(perm,
                                        feature_names=self.feature_names)
     print(eli5.format_as_text(explanation))
Пример #31
0
    def explain_weights(self, **kwargs):
        """
        Call :func:`eli5.explain_weights` for the locally-fit
        classification pipeline. Keyword arguments are first processed by
        ``_fix_target_names`` and then passed on to
        :func:`eli5.explain_weights`.

        :func:`fit` must be called before using this method.
        """
        self._fix_target_names(kwargs)
        explanation = eli5.explain_weights(self.clf_, vec=self.vec_, **kwargs)
        return explanation
Пример #32
0
def save_explanation(crf, out_path='crf-features.html'):
    """Write the top-100 eli5 weight explanation of ``crf`` as an HTML page.

    Parameters
    ----------
    crf
        Fitted model passed to :func:`eli5.explain_weights`.
    out_path : str or path-like
        Destination file; overwritten if it exists.
    """
    print("Writing explanation to {} file".format(out_path))
    expl = eli5.explain_weights(crf, top=100)
    html = eli5.format_as_html(expl)
    # The page declares charset utf-8, so write it as UTF-8 explicitly rather
    # than with the platform default encoding.
    Path(out_path).write_text("""<!DOCTYPE html>
    <html>
        <title>Contact Extraction Model</title>
        <head><meta charset="utf-8"></head>
        <body>{}</body>
    </html>
    """.format(html), encoding="utf-8")