예제 #1
0
    def update_tree(change):
        """Refit the tree on the currently selected features and redraw it.

        Reads the ``value`` of every entry in ``feature_buttons`` to build a
        feature mask, refits ``clf`` on the masked training columns and
        renders the resulting tree inside ``output``.

        Args:
            change: Not used by the body; presumably the change event passed
                by the widget observer machinery -- TODO confirm.
        """
        with output:
            selected = [btn.value for btn in feature_buttons]
            visible_names = np.array(feature_names)[selected]

            clf.fit(X_train[:, selected], y_train)

            # NOTE(review): ``eval`` is resolved from the enclosing scope
            # here, presumably a flag shadowing the builtin -- confirm.
            if eval:
                accuracy = accuracy_score(y_valid, clf.predict(X_valid[:, selected]))
            dtree = dtreeplt(
                model=clf,
                feature_names=visible_names,
                target_names=target_names,
                X=X_train,
                y=y_train,
            )
            # Drop the previous rendering before drawing the new tree.
            clear_output()
            fig = dtree.view()
            if eval:
                title = f'Accuracy(Hold Out 9:1): {accuracy * 100:.3f}%'
                fig.suptitle(title, x=0, fontsize=20)
            plt.tight_layout()
            plt.show()
예제 #2
0
"""

# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy',
                                    random_state=0,
                                    max_depth=3,
                                    min_samples_leaf=5)
clf = classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
#Accuracy = 91%
"""
#Rescaling my independent variables:
X_test = sc.inverse_transform(X_test)
"""

# Decision Tree visualization
from dtreeplt import dtreeplt
# dtreeplt needs one label per feature and one label per class, not the
# training data itself. Column names are used when X_train carries them;
# otherwise generic positional names are generated.
plot_feature_names = (
    list(X_train.columns) if hasattr(X_train, 'columns')
    else ['feature_{}'.format(i) for i in range(X_train.shape[1])]
)
# classes_ holds the labels in the order the fitted tree indexes them.
plot_target_names = [str(label) for label in classifier.classes_]
dtree = dtreeplt(model=classifier,
                 feature_names=plot_feature_names,
                 target_names=plot_target_names)
fig = dtree.view()
fig
# To save the figure, call savefig on the returned figure object.
fig.savefig('output_2.png')
def main():
    """Benchmark scikit-learn's decision tree against the hand-built tree.

    Reads ``mlAssignment2Dataset.csv`` and repeats a shuffled 2/3 : 1/3
    train/test split ``training_number`` times, printing the accuracy of both
    trees for every run followed by the mean and standard deviation. The
    scikit-learn tree from the last run is saved to ``output.png`` and the
    hand-built tree is rendered through ``create_tree_image``. Finally the
    learning-curve accuracies for growing training subsets are printed and
    appended to ``mlAssignment2LearningCurve.csv``.
    """
    # import the dataset using pandas
    df = pd.read_csv('mlAssignment2Dataset.csv', header=0, delimiter=",")

    # choose the number of training cases
    training_number = 10

    # The last column is the class label, so it is excluded from the feature
    # names handed to the plot.
    feature_names = df.columns[:-1].tolist()

    # initialize variables
    average_accuracy = []
    clf_accuracy = []
    learning_curve_accuracy = []
    clf_learning_curve_accuracy = []

    # run the tests and collect the accuracy of every run
    for x in range(training_number):
        # randomly shuffle the data and split into training and test subsets
        df = df.sample(frac=1)
        train_split = int((len(df) * 2) / 3)
        training = df.values[:train_split]
        # Start the test set at train_split (not train_split - 1) so that no
        # training row is also used for evaluation.
        test = df.values[train_split:]

        # run the sklearn decision tree: every column but the last is a
        # feature, the last one is the label
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(training[:, :-1], training[:, -1])

        # Test the imported decision tree algorithm
        test_predictions = clf.predict(test[:, :-1])
        clf_accuracy.append((metrics.accuracy_score(test[:, -1], test_predictions))*100)
        clf_result = "\nScikit CART: test %d accuracy: %f%%" % (x+1, clf_accuracy[x])
        print(clf_result)

        # test the accuracy of our tree
        accuracy = test_tree(build_tree(training), test)
        average_accuracy.append(accuracy)
        result = "Our tree: Test %d has accuracy %d%%" % (x+1, accuracy)
        print(result)

    print("\nAverage accuracy for scikit tree after 10 runs is {:.2f}%, +/- {:.2f}%".format(mean(clf_accuracy), stdev(clf_accuracy)))
    print("Average accuracy after 10 runs is {:.2f}%, +/- {:.2f}%".format(numpy.mean(average_accuracy), stdev(average_accuracy)))

    # print out the sklearn decision tree; class names are taken from
    # clf.classes_ so they line up with the class indices used by the model
    # (unique() returns labels in order of appearance, which may differ).
    dtree = dtreeplt(model=clf,
                     feature_names=feature_names,
                     target_names=[str(label) for label in clf.classes_])
    fig = dtree.view()
    fig.savefig('output.png')

    # create a diagram of our decision tree
    final_string = cart_tree(build_tree(training), None)
    create_tree_image(final_string)

    # get data for learning curves and save to file to be used in excel
    for instances in range(2, len(training), 2):
        accuracy = test_tree(build_tree(training[0:instances]), test)
        learning_curve_accuracy.append(accuracy)

        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(training[0:instances, :-1], training[0:instances, -1])
        test_predictions = clf.predict(test[:, :-1])
        clf_learning_curve_accuracy.append((metrics.accuracy_score(test[:, -1], test_predictions))*100)

    print(clf_learning_curve_accuracy)
    print(learning_curve_accuracy)
    # The context manager closes the file even if a write fails.
    with open("mlAssignment2LearningCurve.csv", 'a+') as curve_file:
        curve_file.write(str(learning_curve_accuracy))
        curve_file.write(str(clf_learning_curve_accuracy))
from sklearn.tree import DecisionTreeClassifier
from dtreeplt import dtreeplt

#Creating a dataframe with the four feature variables
import pandas as pd

df = pd.read_csv('/home/deepak/analytics/Iris_Dataset.csv')

#View top 5 rows
df.head()

# Columns 0-3 are the features, column 4 is the label.
X = df.iloc[:, [0, 1, 2, 3]].values
y = df.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=1 / 3,
                                                    random_state=2)

model = DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(X_train, y_train)

# dtreeplt needs one label per feature and one label per class, not the
# training arrays. Feature labels come from the same columns used to build X;
# class labels come from model.classes_, which is the order the fitted tree
# indexes its classes in.
dtree = dtreeplt(model=model,
                 feature_names=df.columns[[0, 1, 2, 3]].tolist(),
                 target_names=[str(label) for label in model.classes_])
fig = dtree.view()
# To save the figure, call savefig on the returned figure object.
fig.savefig('Iris_output.png')
예제 #5
0
# Target column for the churn classifier.
y = data['Churn']
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# First model: Gini criterion, no depth limit passed.
tree = DecisionTreeClassifier(criterion='gini', random_state=0)

tree.fit(X_train, y_train)

print('Accuracy for train {:.3f}'.format(tree.score(X_train, y_train)))
# Shows how accurately the decision tree built from the training data
# separates that same training data.
print('Accuracy for test {:.3f}'.format(tree.score(X_test, y_test)))

# NOTE(review): dtreeplt presumably indexes target_names in the order of
# tree.classes_ (sorted labels) -- confirm that ['yes', 'no'] matches it.
dtree = dtreeplt(model=tree,
                 feature_names=X_train.columns,
                 target_names=['yes', 'no'])
# feature_names: the input variables. target_names: the names of the target
# variable's classes.
fig = dtree.view()
fig

# In[7]:

# Second model: same setup, but the tree depth is capped at 4.
tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=0)

tree.fit(X_train, y_train)

print('Accuracy for train {:.3f}'.format(tree.score(X_train, y_train)))
# Shows how accurately the decision tree built from the training data
# separates that same training data.
print('Accuracy for test {:.3f}'.format(tree.score(X_test, y_test)))
# dtreeplt quick start, see https://pypi.org/project/dtreeplt/ and
# https://github.com/nekoumei/dtreeplt
# A trained model, feature names and target names are required; the iris
# dataset is used in this example.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from dtreeplt import dtreeplt

iris = load_iris()
# fit() returns the estimator itself, so construction and training chain.
model = DecisionTreeClassifier().fit(iris.data, iris.target)

dtree = dtreeplt(
    model=model,
    feature_names=iris.feature_names,
    target_names=iris.target_names,
)
fig = dtree.view()
# The returned figure object can be written to disk with savefig.
fig.savefig('output_test_Community.png')

# Exploring the data, following
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
import numpy as np
import pandas as pd

# Load the sklearn iris dataset into ``iris`` again.
# type(load_iris()) shows the dataset type and dir(load_iris()) lists its
# attributes.
iris = load_iris()

# np.c_ concatenates arrays column-wise (along the second axis)
    features[:, 11] = data['petroR50_r'] / data['petroR90_r']
    # concentration in z filter
    features[:, 12] = data['petroR50_z'] / data['petroR90_z']

    return features, targets


if __name__ == '__main__':
    # Train a decision tree on 70% of the galaxy catalogue and save a
    # rendering of the fitted tree to DTC.png.
    data = np.load('galaxy_catalogue.npy')
    #filename="decision_tree.jpg"

    #  predicted_class, actual_class = dtc_predict_actual(data)
    training_set, testing_set = splitdata_train_test(data, 0.7)
    train_features, train_targets = generate_features_targets(training_set)
    test_features, test_targets = generate_features_targets(testing_set)
    dtc = DecisionTreeClassifier()
    dtc.fit(train_features, train_targets)
    #dot_data = export_graphviz(dtc, out_file=None,feature_names=['u - g', 'g - r', 'r - i', 'i - z','ecc','m4_u','m4_g','m4_r','m4_i','m4_z','conc1','conc2','conc3'])
    #graph = pydotplus.graph_from_dot_data(dot_data)
    #graph.write_jpg("decision_tree.jpg")
    #predictions= dtc.predict(test_features)
    # The last three features are petroR50 / petroR90 concentration ratios,
    # not fluxes, so they are labelled conc_*. The u/r/z suffixes follow the
    # original label order; only the r and z columns are visible here, so the
    # first is presumably the u filter -- TODO confirm.
    # NOTE(review): target_names are presumably indexed in dtc.classes_
    # (sorted label) order -- confirm this list matches that order.
    dtree = dtreeplt(model=dtc,
                     feature_names=[
                         'u - g', 'g - r', 'r - i', 'i - z', 'ecc', 'm4_u',
                         'm4_g', 'm4_r', 'm4_i', 'm4_z', 'conc_u', 'conc_r',
                         'conc_z'
                     ],
                     target_names=['merger', 'elliptical', 'spiral'],
                     filled=True)
    fig = dtree.view()
    fig.savefig('DTC.png')
예제 #8
0
# Predictor columns and the target column of the marketing data.
x1 = market2[[
    'Age', 'Gender', 'OwnHome', 'Married', 'Location', 'Salary', 'Children'
]]
y1 = market2['Spent']
# One-hot encode the categorical predictors, dropping the first level of each
# to avoid redundant columns.
x1_dummy = pd.get_dummies(x1, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(x1_dummy,
                                                    y1,
                                                    test_size=0.2,
                                                    random_state=None)
tree = DecisionTreeClassifier(criterion='gini', random_state=0)
tree.fit(X_train, y_train)

# scikit-learn stores class labels in ascending order in classes_, so a
# hard-coded ['1', '0'] would attach the wrong name to each class. Deriving
# the names from the fitted model keeps labels and class indices aligned.
dtree = dtreeplt(
    model=tree,
    feature_names=X_train.columns,
    target_names=[str(label) for label in tree.classes_],
)
fig = dtree.view()

# #### (c) Evaluate the decision tree model built in (b) with 5-fold cross-validation and print the mean accuracy across the five folds. (5 points)
#

# In[229]:

from sklearn.model_selection import cross_val_score
import sklearn.metrics as sm
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for i in scoring:
    #tree1=DecisionTreeClassifier(criterion='entropy')
# Draw the decision regions of the fitted model over the data.
# np.integer is an abstract scalar type and NumPy deprecated its use as a
# dtype; np.int_ is the concrete integer type it used to resolve to.
plot_decision_regions(
    X=X,
    y=y.astype(np.int_),
    clf=model,
    legend=None,
    #legend=2
    zoom_factor=3.0)

# Update plot object with X/Y axis labels and Figure Title
plt.xlabel('x1')
plt.ylabel('x2')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.title('Solution Boundary', size=16)

plt.show()

from dtreeplt import dtreeplt
#plt.style.use('presentation')
dtree = dtreeplt(
    model=model,
    feature_names=['x0', 'x1'],
    # scikit-learn keeps class labels in ascending order (model.classes_),
    # so the names are listed in ascending order as well.
    target_names=[
        0, 1
    ]
)
fig = dtree.view()
fig.set_facecolor("lightslategray")
# To save the figure, call savefig on the returned figure object.
fig.savefig('output_quiz.png')
                                     max_leaf_nodes=5,
                                     max_depth=2,
                                     min_samples_leaf=50)

# X_train=X_train.reshape(-1,1)

# Train the entropy-based tree and report its accuracy on the test split.
clf_entropy.fit(X_train, Y_train)

Y_pred = clf_entropy.predict(X_test)

print('Accuracy Score is:::', accuracy_score(Y_test, Y_pred) * 100)

# Build the feature list from a copy without the target column. Plain
# assignment (df1 = df) only aliases the frame, so deleting the column from
# df1 would also remove it from df.
df1 = df.drop(columns=['not.fully.paid'])
features = list(df1.columns)
from IPython.display import Image
# sklearn.externals.six is no longer shipped with scikit-learn; io.StringIO
# is the Python 3 class it used to re-export.
from io import StringIO
import pydotplus
# dot_data=StringIO()
# features=list(df1.columns)
# tree.export_graphviz(clf_entropy,out_file=dot_data,rounded=True,filled=True,feature_names=features,impurity=False)
# graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())
from dtreeplt import dtreeplt

# dtreeplt visualises a fitted estimator, so it receives the trained
# classifier rather than the DataFrame; class names are read from the model
# so they follow its class order.
dtree = dtreeplt(model=clf_entropy,
                 feature_names=features,
                 target_names=[str(label) for label in clf_entropy.classes_])
fig = dtree.view()

# df1=df.reshape(-1,1)
# dot_file=open("pt.dot",'w')
# sd=tree.export_graphviz(clf_entropy,out_file=dot_file,feature_names=df1.columns)