Example #1
def test_plot_tree(pyplot):
    # mostly smoke tests
    # Check correctness of plot_tree
    clf = DecisionTreeClassifier(max_depth=3,
                                 min_samples_split=2,
                                 criterion="gini",
                                 random_state=2)
    clf.fit(X, y)

    # Test export code
    feature_names = ['first feat', 'sepal_width']
    nodes = plot_tree(clf, feature_names=feature_names)
    assert len(nodes) == 3
    assert nodes[0].get_text() == ("first feat <= 0.0\ngini = 0.5\n"
                                   "samples = 6\nvalue = [3, 3]")
    assert nodes[1].get_text() == "gini = 0.0\nsamples = 3\nvalue = [3, 0]"
    assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]"
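# A minimal, hypothetical harness for the test above (not part of the original
# module): `X`, `y`, and the `pyplot` fixture normally come from sklearn's own
# test suite, so the stand-ins below are assumptions chosen to satisfy the
# assertions (a perfectly separable 6-sample dataset).
import pytest
from sklearn.tree import DecisionTreeClassifier, plot_tree

X = [[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1], [1, 1]]
y = [0, 0, 0, 1, 1, 1]

@pytest.fixture
def pyplot():
    # simplified stand-in for sklearn's fixture: yield pyplot, then clean up
    import matplotlib.pyplot as plt
    yield plt
    plt.close("all")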
Example #2
    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.RdYlBu, edgecolor='black', s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right', borderpad=0, handletextpad=0)
plt.axis("tight")

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.show()
Example #3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# print(X_train)
# print(y_train)
# print(X_test)
# print(y_test)

from sklearn import tree
from sklearn.metrics import classification_report, plot_confusion_matrix

clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=40, random_state=0)
clf.fit(X_train, y_train)
y_pre = clf.predict(X_test)
# y_pre_train = clf.predict(X_train)
tree.plot_tree(clf)
print(clf.score(X_test, y_test))
print(classification_report(y_test, y_pre, target_names=None))
# print(classification_report(y_train, y_pre_train, target_names=None))

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(clf, X_test, y_test,
                                 display_labels=None,
                                 cmap=plt.cm.Reds,
                                 normalize=normalize)
    disp.ax_.set_title(title)
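# Note (added): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2. On current versions the equivalent call, with the same
# parameters, is ConfusionMatrixDisplay.from_estimator:
from sklearn.metrics import ConfusionMatrixDisplay

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test,
                                                 display_labels=None,
                                                 cmap=plt.cm.Reds,
                                                 normalize=normalize)
    disp.ax_.set_title(title)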
Example #4
"""Graph the actual tree"""

!pip install -q graphviz
import graphviz
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree

fn=["age","gender","height","weight","ap_hi","ap_lo","cholesterol","gluc","smoke","alco","active","BMI","obesity_level"]
cn=['Positive', 'Negative']

fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (30,30), dpi=300)
tree.plot_tree(dt,
               feature_names = fn, 
               class_names=cn,
               filled = True);

"""Import Naive Bayes Classifier and fit the data"""

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, Y);

"""Make a prediction"""

y_pred = model.predict(X_test)
y_pred

"""View the confusion matrix for the model"""
Example #5
##Plot ROC for Decision Tree Method (with YellowBrick)
##Manual Plotting
plt.style.use('seaborn-poster')
figure = plt.figure(figsize=(10, 6))
ax = plt.subplot(1, 1, 1)
plot_roc(clf, X_val, y_val, "Decision Tree (test)", ax)
plot_roc(clf, X_train, y_train, "Decision Tree (train)", ax)
plt.legend(loc='lower right', fontsize=18)
plt.tight_layout()

##Model Visualization
from sklearn.tree import plot_tree

plt.figure(figsize=[20,10])
plot_tree(clf, filled=True, feature_names = feature_names, label='root', fontsize=14)
plt.show()


##2.0_Model Selection_Grid Search_Naive Bayes(Tune Parameters)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

parameters = {'var_smoothing': [0,1e-9,1e-8]}

gnb = GaussianNB()

NBgridsearch = GridSearchCV(gnb, parameters, cv=15, scoring='roc_auc',
                            return_train_score=True, error_score=0.0, n_jobs=-1)
NBgridsearch.fit(X_train, y_train)

print(NBgridsearch.best_params_)
print(NBgridsearch.best_score_)
Example #6
waveform_features = waveform_data.iloc[:, 0:21]
print(waveform_features)

waveform_labels = waveform_data.iloc[:, 21]
print(waveform_labels)

X_train, X_test, y_train, y_test = train_test_split(waveform_features, waveform_labels,
                                                    test_size=0.33, random_state=42)

tree_classifier = DecisionTreeClassifier(max_depth=5, random_state=1)
tree_classifier.fit(X=X_train, y=y_train)

y_prediction = tree_classifier.predict(X=X_test)

plt.figure(figsize=(75, 10), dpi=200)
tree.plot_tree(decision_tree=tree_classifier, rounded=True, filled=True, fontsize=12)

plt.savefig("Waveform_Decision_Tree")

print(classification_report(y_true=y_test, y_pred=y_prediction))

scores = cross_val_score(estimator=tree_classifier, X=X_train, y=y_train, scoring='accuracy',
                         cv=10)

print("Scores: ", scores)
print("Mean Scores: ", scores.mean())

y_prediction2 = tree_classifier.predict(X=X_train)

print(classification_report(y_true=y_train, y_pred=y_prediction2))
Example #7
    min_impurity_split=None, class_weight=None, presort=False)
    criterion: how split quality is measured. Options: 'gini' (the Gini impurity)
               or 'entropy' (the information gain)
    splitter: strategy used to choose the split at each node. Options: 'best' / 'random'
    max_depth: maximum depth of the tree. None => expand nodes until all leaves are
               pure, or until all leaves hold fewer than min_samples_split samples
    min_samples_split: minimum number of samples required to split an internal node
    min_samples_leaf: minimum number of samples required at a leaf node
    min_weight_fraction_leaf: minimum weighted fraction of the total sample weight
               required at a leaf node
    max_features: number of features to consider when looking for the best split:
               int/float/'auto'/'sqrt'/'log2'/None
    max_leaf_nodes: grow a tree with at most this many leaves, in best-first
               fashion; None => unlimited
    min_impurity_decrease: a node is split if the split induces an impurity
               decrease greater than or equal to this value
    min_impurity_split: a node splits if its impurity is above this threshold
    class_weight: weights associated with the classes
    presort: whether to presort the data to speed up the search for the best split
'''
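# An illustration added for clarity (not from the original source): the
# size-limiting parameters described above are easy to inspect with
# get_depth() and get_n_leaves().
from sklearn.datasets import load_iris
from sklearn import tree

_X, _y = load_iris(return_X_y=True)
for _kwargs in ({}, {'min_samples_leaf': 10}, {'max_leaf_nodes': 4}):
    _clf = tree.DecisionTreeClassifier(random_state=0, **_kwargs).fit(_X, _y)
    print(_kwargs, '-> depth:', _clf.get_depth(), ', leaves:', _clf.get_n_leaves())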
iris = load_iris()
X1, y1 = iris.data, iris.target
clf1 = tree.DecisionTreeClassifier(random_state=0,
                                   criterion='entropy',
                                   max_depth=3)
clf1.fit(X1[:-1], y1[:-1])  # fit
print(clf1.apply([X1[-1]]))  # index of the leaf the sample is predicted to land in
print(clf1.decision_path([X1[-1]]))  # decision path of the sample through the tree
print(clf1.feature_importances_)  # importance of each feature
print(clf1.get_depth(), clf1.get_n_leaves())  # depth of the tree and number of leaves
print(clf1.predict([X1[-1]]),
      clf1.predict_proba([X1[-1]]))  # predicted class, and the probability of each class
tree.plot_tree(clf1)  # draw the tree
r = tree.export_text(clf1, feature_names=iris['feature_names'])
print(r)  # print the tree as text rules
Example #8
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = DecisionTreeClassifier(criterion='entropy')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

from sklearn.tree import plot_tree
plot_tree(model)
Example #9
# to interpret the results.

# %%
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

max_leaf_nodes = 3
tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=0)
tree.fit(X, y)

# %% [markdown]
# `plot_tree` will allow us to visually check the set of rules learnt by
# the decision tree.

# %%
_ = plot_tree(tree)

# %%
# plot the decision function learned by the tree
plot_tree_decision_function(tree, X, y)

# %% [markdown]
# By allowing only 3 leaves in the tree, we get rules similar to the ones we
# designed by hand:
# * people younger than 28.5 years old (X[0] < 28.5) are classified as
#   earning `<= 50K`;
# * people older than 28.5 who work fewer than 40.5 hours per week
#   (X[1] <= 40.5) are also classified as earning `<= 50K`, while those
#   working more than 40.5 hours per week are classified as earning `> 50K`.
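# %% [markdown]
# As an added aside (not in the original notebook): `export_text` prints the
# same rules as plain text. The feature names below are assumptions inferred
# from the description above (X[0] is the age, X[1] the hours worked per week).

# %%
from sklearn.tree import export_text

print(export_text(tree, feature_names=["age", "hours-per-week"]))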
Example #10
test = test.drop([target] + extraExcludes, axis=1)
varSelected = ['TicketMeanT0mainWithDummies', 'Sex0mainWithDummies']
X = train[varSelected].fillna(0)
y = train_y

clf = DecisionTreeClassifier(max_leaf_nodes=1000,
                             random_state=0,
                             max_depth=100)

# Train Decision Tree Classifer
clf = clf.fit(X, y)

#Predict the response for test dataset
y_pred = clf.predict(X)
test_pred = clf.predict(test[varSelected].fillna(0))

train['predicted'] = y_pred
test['predicted'] = test_pred
score_test = metrics.roc_auc_score(test_y, test_pred)  # y_true comes first, then the scores
score_train = metrics.accuracy_score(y, train['predicted'])
print(score_train, score_test)

from sklearn import tree
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(clf,
               feature_names=varSelected,
               class_names=['0', '1'],  # one label per class; a bare string would be iterated character by character
               filled=True)
fig.savefig(baseLoc + 'imagename.png')
Example #11
tree_clf.fit(X_train, Y_train)

# In[15]:

Y_pred = tree_clf.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
# percentage of correct predictions
perc_correct = np.diag(cm).sum() / len(X_test)
perc_correct
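# Equivalently (an added aside, not in the original notebook): sklearn's
# accuracy_score computes the same diagonal-over-total ratio directly.
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)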

# In[19]:

# our tree predicted 64% of the test set correctly
# let's take a look at the tree
from sklearn.tree import plot_tree
plot_tree(tree_clf, max_depth=3, filled=True)

# In[20]:

# now it's time to try to improve our tree
# with something called bagging

from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier()
bag_clf.fit(X_train, Y_train.ravel())

# In[21]:

Ybag_pred = bag_clf.predict(X_test)
cm_bag = confusion_matrix(Y_test, Ybag_pred)
# percentage of correct predictions
Example #12
    #Plot the decision boundary
    plt.subplot(2,3,pairidx + 1)

    x_min,x_max = x[:,0].min()-1,x[:,0].max()+1
    y_min,y_max = x[:,1].min()-1,x[:,1].max()+1
    xx,yy = np.meshgrid(np.arange(x_min,x_max,plot_step),
                        np.arange(y_min,y_max,plot_step))
    plt.tight_layout(h_pad=0.5,w_pad=0.5,pad=2.5)

    z = clf.predict(np.c_[xx.ravel(),yy.ravel()])
    z = z.reshape(xx.shape)
    cs = plt.contourf(xx,yy,z,cmap=plt.cm.RdYlBu)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])

    #Plot the training points
    for i,color in zip(range(n_classes),plot_colors):
        idx = np.where(y==i)
        plt.scatter(x[idx,0],x[idx,1],c=color,label=iris.target_names[i],
                    cmap=plt.cm.RdYlBu,edgecolor='black',s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right')
plt.axis("tight")

plt.figure()
clf = DecisionTreeClassifier().fit(iris.data,iris.target)
plot_tree(clf,filled=True)
plt.savefig('D:/python/czpython/save_decision_tree.png')  # note: only this one figure is saved
plt.show()
Example #13
from sklearn import tree
import pandas as pd
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

data = pd.read_csv('decision_tree_sample.csv')
feature_cols = [
    'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
    'modelyear'
]
Y = data.mpg.to_numpy()
data.drop(['mpg', 'maker'], axis=1, inplace=True)
X = data.to_numpy()
#create model
model = tree.DecisionTreeClassifier(criterion='gini')
#Train model
model.fit(X, Y)

plt.figure(figsize=(30, 10))
a = plot_tree(model,
              feature_names=feature_cols,
              class_names=['Bad', 'OK', 'Good'],
              filled=True,
              rounded=True,
              fontsize=5)

b = 1
Example #14
plot_decision_regions(X_combined, y_combined,
                      classifier=tree_model,
                      test_idx=range(105, 150))

plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_20.png', dpi=300)
plt.show()





tree.plot_tree(tree_model)
#plt.savefig('images/03_21_1.pdf')
plt.show()






dot_data = export_graphviz(tree_model,
                           filled=True, 
                           rounded=True,
                           class_names=['Setosa', 
                                        'Versicolor',
                                        'Virginica'],
                           feature_names=['petal length', 
Example #15
#Result Interpretation:
#A low MSE means that the predicted values closely match the actual values
#RMSE describes how accurately the model fits the response. A low RMSE is an indicator of "good fit", meaning the predictions are very close to the real values
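# A short worked sketch of the two metrics discussed above (added for clarity;
# y_true and y_hat are hypothetical stand-ins for the targets and predictions):
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([3.0, 5.0, 2.5, 7.0])
y_hat = np.array([2.8, 5.1, 2.9, 6.6])
mse = mean_squared_error(y_true, y_hat)   # mean of the squared errors
rmse = np.sqrt(mse)                       # same units as the response
print(mse, rmse)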


# In[ ]:


#Decision Tree
from sklearn import tree
tree_obj = tree.DecisionTreeClassifier()
tree1 = tree_obj.fit(train, train_labels)

#Visualization of the tree
fig, ax = mlt.pyplot.subplots(figsize=(40,40))
tree.plot_tree(tree_obj, feature_names=liver_data.columns, class_names=True, filled=True, max_depth=4, fontsize=15)
mlt.pyplot.savefig('tree.png')  # save before show(); show() clears the current figure
mlt.pyplot.show()


# In[ ]:


#Prediction
dec_prediction = tree1.predict(test)
comp2 = pd.DataFrame({'Actual': test_labels, 'Predicted':dec_prediction})
comp3 = comp2.head(25)
print('The comparison between test_labels and the prediction labels:\n {}'.format(comp3))

comp3.plot(kind='bar', figsize=(12,10))
mlt.pyplot.grid(which='major', linestyle='-', linewidth='0.5', color='black')
Example #16
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from sklearn.preprocessing import OneHotEncoder, LabelEncoder  # LabelEncoder is used below but was not imported here
        from collections import Counter
        from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
        from sklearn.linear_model import LogisticRegression, LinearRegression
        from sklearn import tree
        import matplotlib.pyplot as plt

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            clf = DecisionTreeClassifier(random_state=42,
                                         max_depth=self.params["tree_depth"])
            self.is_classifier = True

        else:
            clf = DecisionTreeRegressor(random_state=42,
                                        max_depth=self.params["tree_depth"])
            self.is_classifier = False

        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Record the datatype of each feature
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One hot encode the categorical features
        # And replace missing values with a Missing category
        if len(self.X_categorical) > 0:

            X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
                "Missing").copy()
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(
                self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:
            X.loc[:, self.X_numeric] = X[self.X_numeric].fillna(-999).copy()

        # Fit the decision tree
        clf.fit(X, y)
        if self.is_classifier:
            yy = clf.predict_proba(X)

            p = np.round(yy[:, 1], 5)  # np.round_ was removed in NumPy 2.0
        else:
            yy = clf.predict(X)

            p = np.round(yy, 5)

        self.leaf_categories = list(set(p))
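        # Added commentary: `p` holds each row's rounded leaf prediction, so
        # grouping rows by `p` approximates grouping them by leaf node
        # (assuming distinct leaves produce distinct rounded outputs).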

        # Fit linear or logistic models to each leaf node
        model_array = {}
        equation_log = []
        for cat in self.leaf_categories:
            if self.is_classifier:
                if (np.mean(y[p == cat]) < 1) and (np.mean(y[p == cat]) > 0):

                    lm = LogisticRegression(random_state=42)

                    lm.fit(X[p == cat], y[p == cat])

                    model_array[cat] = lm
                    equation_log.append([[
                        int(round((1 - cat) * sum(p == cat))),
                        int(round(cat * sum(p == cat)))
                    ],
                                         sum(p == cat), lm.intercept_[0]] +
                                        list(lm.coef_[0]))
                else:
                    loggerinfo(logger, "No leaf fit")
                    model_array[cat] = "dt"
            else:
                try:
                    lm = LinearRegression()
                    lm.fit(X[p == cat], y[p == cat])

                    model_array[cat] = lm

                    equation_log.append(
                        [cat, sum(p == cat), lm.intercept_] + list(lm.coef_))
                except Exception:
                    loggerinfo(logger, "No leaf fit")
                    model_array[cat] = "dt"

        # Save the leaf models
        pd.DataFrame(equation_log,
                     columns=['leaf value', 'number of samples', 'intercept'] +
                     list(X.columns)).to_csv(
                         os.path.join(tmp_folder, 'Leaf_model_coef.csv'))

        # Plot the decision tree
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8), dpi=1600)
        tree.plot_tree(clf, feature_names=list(X.columns))
        fig.savefig(os.path.join(tmp_folder, 'Decision_tree_plot.png'))

        importances = clf.feature_importances_
        loggerinfo(logger, str(importances))

        self.mean_target = np.array(sum(y) / len(y))

        model = [clf, model_array]
        # Set model properties
        self.set_model_properties(model=model,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
Example #17
                          # (truncated: the opening of this Streamlit widget call, e.g. st.number_input(...), is missing)
                          value=max_d,
                          step=1,
                          format=None,
                          key="ad")

modelo = DecisionTreeClassifier(max_depth=max_d)

result = classificação(modelo, target, preditores, df_treino, df_teste)
modelo = result["modelo"]
lista_modelos.append(result)

st.header("Visualização da árvore")

#plt.figure(figsize=(12,8))
plot_tree(modelo,
          feature_names=preditores,
          filled=True,
          class_names=["0", "1", "2", "3", "4"])

st.pyplot()

#
# Feature importance
#
st.header("Relevância das variáveis")

#modelo.feature_importances_
importancia = pd.Series(modelo.feature_importances_, index=preditores)
importancia = importancia.sort_values(ascending=True)
importancia.plot(kind="barh")
st.pyplot()
Example #18
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3490, random_state=101)


# In[49]:


c = tree.DecisionTreeClassifier()
c = c.fit(X_train, Y_train)


# In[50]:


get_ipython().run_line_magic('matplotlib', 'inline')
tree.plot_tree(c)


# In[51]:


Y_pred = c.predict(X_test)


# In[52]:


from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))
Example #19
print(X_train.shape)
print(y_train.shape)
print("\r\n")
print(X_test.shape)
print(y_test.shape)
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion    =  'entropy',
                              max_depth    =  3,
                              random_state =  0 )
tree.fit(X_train, y_train)
from sklearn.tree import plot_tree

plot_tree(tree,
          feature_names = cancer.feature_names,
          fontsize      = 7 )
from sklearn.metrics import accuracy_score
y_pred_train = tree.predict(X_train)
print("Train Set Accuracy : ", accuracy_score(y_train, y_pred_train))
y_pred_test = tree.predict(X_test)
print("Test Set Accuracy  : ", accuracy_score(y_test, y_pred_test))
# GINI IMPURITY
tree_gin_d1 = DecisionTreeClassifier(criterion    =  'gini',
                                     max_depth    =  1,
                                     random_state =  0 )
tree_gin_d1.fit(X_train, y_train)

y_pred_train_gin_d1 = tree_gin_d1.predict(X_train)
y_pred_test_gin_d1  = tree_gin_d1.predict(X_test)
Example #20
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(75 / 80)
#Accuracy = 93.75%
"""
#Rescaling my independent variables:
X_test = sc.inverse_transform(X_test)
"""

# Decision Tree visualization-----------------
from sklearn import tree

#Simple Decision Tree
#tree.plot_tree(classifier)
#image is quite blurred

#Let's try to make the decision tree more interpretable by adding class names and filling colors.
#cn=['0','1']
#tree.plot_tree(classifier,class_names=cn,filled = True)
#Although the decision tree now shows class names & colored leaves, its rendering is still blurred.

#Let's create a blank chart of the desired size using matplotlib and place our decision tree there.
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
cn = ['0', '1']
tree.plot_tree(classifier, class_names=cn, filled=True)

#To save the figure, use the savefig method of the returned figure object.
fig.savefig('imagename2.png')
Example #21
X['Embarked'] = [0 if str(name) == "nan" else ord(name) for name in X['Embarked']]
X['Cabin'] = [0 if str(el) == "nan" else len(str(el)) for el in X['Cabin']]
X['Ticket'] = [ 0 if str(el) == "nan" else len(str(el)) for el in X['Ticket']]

def sumASCII(name):
    accum = 0
    for i in name:
        accum += ord(i)
    return accum

X['Name'] = [0 if str(name) == "nan" else sumASCII(str(name)) for name in X['Name']]
X['Sex'] = [0 if sex == 'male' else 1 for sex in X['Sex']]

# Just give up on these for now
# X = X.drop(['Ticket', 'Cabin', 'Embarked'], axis = 1)
X = X.fillna(0)

# train the model
clf = clf.fit(X, Y)

# run predictions on the data, using the model. They should mostly conform to the training values Y
y_pred = clf.predict(X)
fig = plt.figure()
_ = tree.plot_tree(clf, feature_names=X.columns, filled=True)
returnVal = metrics.precision_recall_fscore_support(Y, y_pred, average="macro")
print(returnVal)


fig.savefig("tree.png")
plt.show()
    print("Train index: \n", train_index)
    print("Test index: \n", test_index)
    i = i + 1

for train_index, test_index in kf_10.split(X):
    #print(train_index, test_index)
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_rf = RandomForestClassifier(n_estimators=10,
                                      criterion='gini',
                                      random_state=42)
    model_rf.fit(X_train, y_train)  # train the model/classifier
    y_pred = model_rf.predict(X_test)  # make predictions

    print("confusion matrix: \n", confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    print("=======================================================")

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=700)
tree.plot_tree(model_rf.estimators_[0], filled=True)
fig.savefig('rf_individualtree.png')

# export model to create API
pickle.dump(model_rf, open('model.pkl', 'wb'))

cek = [[31, 5.2, 6.8, 10.9, 4.2, 33]]
print("model rf")
print(model_rf.predict(cek))
Example #23
# plt.axis("tight")
#
# plt.figure()
# clf = DecisionTreeClassifier().fit(iris.data, iris.target)
# plot_tree(clf, filled=True)
# plt.show()

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = DecisionTreeClassifier(max_leaf_nodes=6, random_state=0)
plt.figure()
estimator.fit(X_train, y_train)
plot_tree(estimator, filled=True)
plt.show()



from sklearn.tree import _tree

def tree_to_code(tree, feature_names, target_names):
    tree_ = tree.tree_
    conditions  = []
    # classes = tree.classes_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    target_name = [
Example #24
model.fit(X_train, y_train)
print('Optimal parameters:', model.best_params_)
y_test_hat = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]


print('Model evaluation (Optimal Hyperparameters)')
print('Accuracy:')
print(metrics.accuracy_score(y_test, y_test_hat))
print('Classification report:')
print(metrics.classification_report(y_test, y_test_hat))


print('Confusion matrix (Optimal Hyperparameters)')
cm = ConfusionMatrix(y_test, y_test_hat)
print(cm)


cm.print_stats()
ax = cm.plot(backend='seaborn', annot=True, fmt='g')
ax.set_title('Confusion Matrix (Optimal Hyperparameters)')
plt.show()


print('ROC curve (Optimal Hyperparameters)')
plot_roc_curve(y_test, y_test_prob)

tree = model.best_estimator_
plot_tree(tree, filled=True)
plt.show()
Example #25
ax.plot(X_test, y_pred, linewidth=4, label="Decision tree")
_ = plt.legend()

# %% [markdown]
# We see that the decision tree model makes no a priori assumption about the
# distribution of the data, so we do not end up with a straight line relating
# flipper length to body mass.
#
# Instead, we observe that the predictions of the tree are piecewise constant.
# Indeed, our feature space was split into two partitions. We can check the
# tree structure to see which threshold was found during training.

# %%
from sklearn.tree import plot_tree

_ = plot_tree(tree, feature_names=data_columns)

# %% [markdown]
# The threshold for our feature (flipper length) is 202.5 mm. The predicted
# values on each side of the split are two constants: 3683.50 g and 5023.62 g.
# These values correspond to the mean values of the training samples in each
# partition.
#
# In classification, we saw that increasing the depth of the tree allowed us to
# get a more complex decision boundary. We can check the effect of increasing
# the depth in a regression setting.

# %%
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
Example #26
Y = data.values[:,15]
#xyz=data.values[180:181,:13]
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)


# In[88]:


clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
from sklearn import tree
clf = tree.DecisionTreeClassifier()  # note: this second, unfitted classifier is never used below

#Once trained, you can plot the tree with the plot_tree function:
%matplotlib inline
tree.plot_tree(clf_gini) 

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(125,200))
_ = tree.plot_tree(clf_gini,filled=True)


# In[92]:


y_pred = clf_gini.predict(X_test)
#y_pred
print("\nAccuracy score ::: ", accuracy_score(y_test, y_pred)*100)

Example #27
if __name__ == '__main__':
    with open('lenses.txt', 'r') as fr:  # load the file
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]  # parse each line
    lenses_target = []  # collect each record's class label in a list
    for each in lenses:
        lenses_target.append(each[-1])
    print(lenses_target)

    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']  # feature labels
    lenses_list = []  # temporary list holding one column of the lenses data
    lenses_dict = {}  # dict of lenses data, used to build the pandas DataFrame
    for each_label in lensesLabels:  # extract the values, column by column
        for each in lenses:
            lenses_list.append(each[lensesLabels.index(each_label)])
        lenses_dict[each_label] = lenses_list
        lenses_list = []
    # print(lenses_dict)                                              # print the dict
    lenses_pd = pd.DataFrame(lenses_dict)  # build the pandas DataFrame
    # print(lenses_pd)                                                # print the DataFrame
    le = LabelEncoder()  # create a LabelEncoder object for encoding the categories
    for col in lenses_pd.columns:  # encode each column
        lenses_pd[col] = le.fit_transform(lenses_pd[col])
    # print(lenses_pd)                                                # print the encoded data

    clf = tree.DecisionTreeClassifier(
        max_depth=4)  # create the DecisionTreeClassifier
    # clf = clf.fit(lenses_pd.values.tolist(), lenses_target)         # fit the tree on the data
    tree.plot_tree(clf.fit(lenses_pd.values.tolist(), lenses_target),
                   filled=True)
    plt.show()
Example #28
X_test.shape

# In[186]:

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier.fit(X_train, Y_train)

# In[187]:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
#Use the pandas apply method to numerically encode our attrition target variable

# In[188]:

plt.figure(figsize=(10, 11))
plot_tree(classifier)
plt.show()

# In[189]:

classifier.score(X_train, Y_train)

# In[190]:

classifier.score(X_test, Y_test)

# In[ ]:
Example #29
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.25)

model = KNeighborsClassifier()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
df = pd.DataFrame({'actual': ytest, 'ypred': ypred})
df
model.score(xtest, ytest)

mod = GaussianNB()
mod.fit(xtrain, ytrain)
mod.predict(xtest)
mod.score(xtest, ytest)

mod1 = SVC()
mod1.fit(xtrain, ytrain)
mod1.predict(xtest)
mod1.score(xtest, ytest)

mod2 = DecisionTreeClassifier(max_depth=2)
mod2.fit(xtrain, ytrain)
mod2.predict(xtest)
mod2.score(xtest, ytest)

mod3 = RandomForestClassifier(n_estimators=700)
mod3.fit(xtrain, ytrain)
mod3.predict(xtest)
mod3.score(xtest, ytest)

plot_tree(mod2)  # mod2 was already fitted above; no need to refit
Example #30
x = tree_data.iloc[:,:5]
x
y = tree_data.iloc[:,5]
y

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)


dt = DecisionTreeClassifier()
model = dt.fit(x, y)

pred = model.predict(x)
acc=accuracy_score(y, pred)
acc # 1.0

tree.plot_tree(model)
#@@12

x_col = x.columns
x_col[2] #  'income'
x_col[1] # 'age'

export_graphviz(model, out_file='smoking_tree_graph.dot', feature_names=x_col)
#@@13
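# An added aside (not in the original): the .dot file written above can be
# rendered to an image with the graphviz package, assuming it is installed.
import graphviz

with open('smoking_tree_graph.dot') as f:
    graph = graphviz.Source(f.read())
graph.render('smoking_tree_graph', format='png', cleanup=True)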

dt = DecisionTreeClassifier(criterion='entropy')
model = dt.fit(x, y)

pred = model.predict(x)
acc=accuracy_score(y, pred)
acc
Example #31
dtc = DecisionTreeClassifier()  # model
dtc.fit(X_train, y_train)  # fit

# Predictions on the test data
y_pred = dtc.predict(X_test)  # predict
dtc.score(X_test, y_test)  # first evaluation, based on accuracy

# Confusion matrix and metrics
confusion_matrice_tree = confusion_matrix(y_test, y_pred)
print("Arbre de décision")
print_metrics(y_test, y_pred)

#Tree visualization, method 3
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(dtc, filled=True, max_depth=5, fontsize=10)
"""5.2. Random Forest"""

from sklearn.ensemble import RandomForestClassifier

# Classification with a random forest
rf = RandomForestClassifier(max_depth=10)  # build the model
rf.fit(X_train, y_train)  # fit / train

# Predictions on the test data
y_pred = rf.predict(X_test)  # predict: boolean True/False
y_pred_proba = rf.predict_proba(
    X_test)[:, 1]  # predict: probability of being True, between 0 and 1
# The predicted class probability is the fraction of samples of the same class in a leaf.

# Confusion matrix and metrics
Example #32
RandomForestClassifier(...)
print(clf.predict([[0, 0, 0, 0]]))

out = clf.predict(X)
correct = np.where(out == y)
acc = len(correct[0]) / len(y) * 100
print(acc)

count = 0
for i in range(n_estimators):
    count += clf.estimators_[i].tree_.node_count
    print(clf.estimators_[i].tree_.node_count)
print(count)
a = clf.estimators_[0]
#plt.figure(10,10)
tree.plot_tree(a)

count = 0
for i in range(n_estimators):
    count += clf.estimators_[i].tree_.node_count
    print(clf.estimators_[i].tree_.node_count)
print(count)

#%%
#save_name= 'rf9_nf_cv1.p'

save_name = '3c_rf6_nf_norm_k2_cv1.p'
result_dir = 'results/'
rf_model = joblib.load(result_dir + save_name, mmap_mode='r')
#model = pickle.load(open(result_dir+'train_'+save_name+'_best.pth', 'rb'))
#