from sklearn.metrics import confusion_matrix
from microsoftml import categorical, rx_fast_trees, rx_predict

# *test* and *y_pred* come from a training step not shown in this snippet.
conf = confusion_matrix(test["Label"], y_pred["PredictedLabel"])
print(conf)

try:
    trees2 = rx_fast_trees(
        "Label ~ age + fnlwgt + educationnum + capitalgain + capitalloss + education",
        data=train)
except Exception as e:
    print(e)
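# Training fails when a raw string column such as *education* is passed as a
# feature; the next call encodes it first with the categorical transform.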

trees2 = rx_fast_trees(
    "Label ~ age + fnlwgt + educationnum + capitalgain + capitalloss + education_cat",
    data=train,
    ml_transforms=[categorical(cols=dict(education_cat="education"))])
y_pred2 = rx_predict(trees2, test)
conf = confusion_matrix(test["Label"], y_pred2["PredictedLabel"])
print(conf)

# Map each new encoded column name to the original string column.
cats = {}
for col in [
        "workclass", "education", "maritalstatus", "occupation",
        "relationship", "race", "sex", "nativecountry"
]:
    cats[col + "_cat"] = col

formula = "Label ~ age + fnlwgt + educationnum + capitalgain + capitalloss + " + \
          " + ".join(sorted(cats.keys()))

print(cats)
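
# A minimal sketch (not part of the original snippet) of how *cats* and
# *formula* could be used: the categorical transform accepts the whole
# mapping, so every string column is encoded at once. It assumes the same
# *train*/*test* split as in the previous calls.
trees3 = rx_fast_trees(
    formula,
    data=train,
    ml_transforms=[categorical(cols=cats)])
y_pred3 = rx_predict(trees3, test)
print(confusion_matrix(test["Label"], y_pred3["PredictedLabel"]))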
#
# To answer that question, we need to add the wine color
# as a new feature. Because it is a categorical feature, we
# need to convert it into a numerical one.
# We use the transform :epkg:`microsoftml:categorical`
# to convert column *color* into *color_num*.

from microsoftml import categorical
from sklearn.metrics import r2_score
cols = list(wines.columns.drop(["quality",
                                "color"]))  # We still drop column color.
cols.append("color_num")  # But we add the new one.
model = rx_fast_trees(
    "quality ~ " + " + ".join(cols),
    data=wines_train,
    method="regression",
    ml_transforms=[categorical(cols=dict(color_num="color"))])
pred = rx_predict(model, wines_test, extra_vars_to_write=["quality"])
r2_color = r2_score(pred.quality, pred.Score)
print("R2 with colors=", r2_color)

#####################################
# Performance is not better than without the color feature. Let's confirm
# that by looking at the feature importances.

feature_importance = [(k, v)
                      for k, v in model.summary_["keyValuePairs"].items()]

import numpy
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)
ind = numpy.arange(len(feature_importance))
ax.barh(ind, [f[1] for f in feature_importance], 0.35)
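# Add the feature names as tick labels so each bar can be identified
# (a small addition, not in the original snippet).
ax.set_yticks(ind)
ax.set_yticklabels([f[0] for f in feature_importance])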