import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the diabetes dataset
df = pd.read_csv('../datasets/diabetes.csv')
X = df.drop('diabetes', axis=1).values
y = df['diabetes'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)


accuracy = dict()
roc_auc = dict()

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=None,
                              random_state=1)

bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500, 
                        max_samples=1.0, 
                        max_features=1.0, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        n_jobs=1, 
                        random_state=1)

tree = tree.fit(X_train, y_train)
y_test_pred = tree.predict(X_test)
y_pred_prob = tree.predict_proba(X_test)[:,1]
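The excerpt stops after scoring the single tree and never touches the bagging ensemble or the accuracy/roc_auc dicts it declares; a minimal continuation reusing the names above might look like this (a sketch, not part of the original):

from sklearn.metrics import accuracy_score

accuracy['tree'] = accuracy_score(y_test, y_test_pred)
roc_auc['tree'] = roc_auc_score(y_test, y_pred_prob)

# fit the bagging ensemble and record its scores alongside the single tree
bag = bag.fit(X_train, y_train)
accuracy['bag'] = accuracy_score(y_test, bag.predict(X_test))
roc_auc['bag'] = roc_auc_score(y_test, bag.predict_proba(X_test)[:, 1])
print(accuracy, roc_auc)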
Example No. 2
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# read the data from a csv file
music_data = pd.read_csv('weather.csv') 

# split into features (x) and target (y)
x = music_data.drop(columns=['output'])
y = music_data['output']

# create model
model = DecisionTreeClassifier()
model.fit(x,y)

# persist the trained model to disk
joblib.dump(model, 'trained_model.joblib')
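A brief usage note (not in the original): the saved file can later be reloaded and used for prediction without retraining, e.g.

model = joblib.load('trained_model.joblib')
predictions = model.predict(x)  # here reusing the training features as a stand-in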
Example No. 3
    m, n = df1.shape
    X = df1.iloc[:, 0:n - 1]
    Y = df1["price"]
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)
    test = pd.concat([x_test, y_test], axis=1)
    # test.to_csv("./t.csv", sep=",", index=0)
    # from sklearn.preprocessing import StandardScaler
    #
    # ss = StandardScaler()
    # x_train = ss.fit_transform(x_train)
    # x_test = ss.transform(x_test)

    # note: despite the variable name, this is a decision-tree classifier
    regressor = DecisionTreeClassifier(random_state=0)
    parameters = {'max_depth': range(10, 50)}
    scoring_fnc = make_scorer(accuracy_score)
    kfold = KFold(n_splits=10)
    grid = GridSearchCV(regressor, parameters, scoring_fnc, cv=kfold)
    grid = grid.fit(x_train, y_train.ravel())
    reg = grid.best_estimator_
    print('train score: %f' % grid.best_score_)
    print('best parameters:')
    for key in parameters.keys():
        print('%s: %d' % (key, reg.get_params()[key]))
    print('test score: %f' % reg.score(x_test, y_test))

    import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

    joblib.dump(grid, "./" + i[:-4] + ".m")
Example No. 4
# label-encode the categorical columns (indices 7 through 21)
for col in range(7, 22):
    X[:, col] = labelencoder_X.fit_transform(X[:, col])

features = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
            'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
            'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
            'stalk-surface-below-ring', 'stalk-color-above-ring',
            'stalk-color-below-ring', 'veil-type', 'veil-color',
            'ring-number', 'ring-type', 'spore-print-color', 'population',
            'habitat']

X = df[features]
y = df['class']

from sklearn import tree
import pydotplus
import matplotlib.image as pltimg
import matplotlib.pyplot as plt

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X, y)
data = tree.export_graphviz(dtree, out_file=None, feature_names=features)
graph = pydotplus.graph_from_dot_data(data)
graph.write_png('mydecisiontree.png')

img = pltimg.imread('mydecisiontree.png')
imgplot = plt.imshow(img)
plt.show()
Example No. 5
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
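The excerpt instantiates bc but never fits it; a minimal continuation, assuming a train/test split already exists (a sketch, not from the original):

bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)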
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def run_demo():
    ####################################################################################################################
    # Tip 1: Use make_column_transformer to apply different preprocessing to different columns                         #
    # NOTE: I'm not sure this works                                                                                    #
    ####################################################################################################################

    # Load data (loading Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
    )

    # Make Transformer
    preprocessing = make_column_transformer(
        (OneHotEncoder(), ['Pclass', 'Sex']), (SimpleImputer(), ['Age']),
        remainder='passthrough')

    # Fit-Transform data with transformer
    data = preprocessing.fit_transform(data)

    ####################################################################################################################
    # Tip 2: Use make_column_selector to pick columns by data type for different preprocessing                        #
    # NOTE: I'm not sure this works                                                                                    #
    ####################################################################################################################

    # Load data (loading Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
    )

    # Make Transformer
    preprocessing = make_column_transformer(
        (OneHotEncoder(), make_column_selector(dtype_include='object')),
        (SimpleImputer(), make_column_selector(dtype_include='int')),
        remainder='drop')

    # Fit-Transform data with transformer
    data = preprocessing.fit_transform(data)

    ####################################################################################################################
    # Tip 3: Use Pipeline. A Pipeline chains together multiple preprocessing steps; the output of each step is used   #
    # as input to the next, which makes it easy to apply the same preprocessing to Train and Test.                    #
    ####################################################################################################################

    # Load data (loading Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
    )

    # Set X and y
    X = data.drop('Survived', axis=1)
    y = data[['Survived']]

    # Split Train and Test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    # Set variables
    ohe = OneHotEncoder(handle_unknown='ignore')  # the 'sparse' kwarg was removed in newer scikit-learn
    imputer = SimpleImputer(add_indicator=True)   # the 'verbose' kwarg was removed in newer scikit-learn
    scaler = StandardScaler()
    clf = DecisionTreeClassifier()

    # Make Transformer
    preprocessing = make_column_transformer((make_pipeline(imputer, scaler), [
        'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare'
    ]), (ohe, ['Pclass', 'Sex', 'Name']),
                                            remainder='passthrough')

    # Make pipeline
    pipe = make_pipeline(preprocessing, clf)

    # Fit model
    pipe.fit(X_train, y_train.values.ravel())
    print("Best score : %f" % pipe.score(X_test, y_test.values.ravel()))

    ####################################################################################################################
    # Tip 4: You can grid search an entire pipeline and find optimal tuning parameters                                 #
    ####################################################################################################################

    # Load data (loading Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
    )

    # Set X and y
    X = data.drop('Survived', axis=1)
    y = data[['Survived']]

    # Set variables
    clf = LogisticRegression()
    ohe = OneHotEncoder()
    scaler = StandardScaler()
    imputer = SimpleImputer()

    # Make Transformer
    preprocessing = make_column_transformer((make_pipeline(imputer, scaler), [
        'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare'
    ]), (ohe, ['Sex']),
                                            remainder='drop')

    # Make pipeline
    pipe = make_pipeline(preprocessing, clf)

    # Set params for Grid Search
    params = {}
    params['logisticregression__C'] = [0.1, 0.2, 0.3]
    params['logisticregression__max_iter'] = [200, 500]

    # Run grid search
    grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
    grid.fit(X, y.values.ravel())

    print(grid.best_score_)
    print(grid.best_params_)
Example No. 7
 def fit(self, x_data, y_data):
     """Train the model with a decision tree."""
     self.estimator = DecisionTreeClassifier()
     self.estimator.fit(x_data, y_data)
     print("Training succeeded")
Example No. 8
    #Linear kernel has no need for gamma
    kpcas.append(('Linear K', 'lin_k', KernelPCA(n_components=2, kernel='linear')))
    kpcas.append(('RBF K', 'rbf_k',KernelPCA(n_components=2, kernel='rbf', gamma=gamma)))
    #kpcas.append(('Polynomial K', 'ply_k', KernelPCA(n_components=2, kernel='poly', gamma=gamma)))
    #kpcas.append(('Sigmoid K', 'sig_k', KernelPCA(n_components=2, kernel='sigmoid', gamma=gamma)))
    #kpcas.append(('Cosine K', 'cos_k',KernelPCA(n_components=2, kernel='cosine', gamma=gamma)))

    #Instantiate models with default parameters

    models = []

    models.append(('Linear SVM', 'lin_svc', SVC(kernel='linear', probability=True)))
    models.append(('RBF Kernel SVM','rbf_svc', SVC(kernel='rbf', gamma=gamma, probability=True)))
    models.append(('K-Nearest Neighbour', 'knn', KNeighborsClassifier()))
    models.append(('Logistic Regression', 'log_reg', LogisticRegression()))
    models.append(('Decision Tree', 'dec_tree', DecisionTreeClassifier()))
    models.append(('Gaussian Naive Bayes', 'gnb', GaussianNB()))
    models.append(('Random Forest', 'rf', RandomForestClassifier()))
    models.append(('Gradient Boosting', 'gb', GradientBoostingClassifier()))

    #models.append(('PLS', PLSRegression())) # Scale=False as data already scaled.

    folds = 10

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=10)  # random_state requires shuffle=True

    # Declare KPCA kernels deployed

    kpca_kernels = []

    for kernel, abbreviation, kpca in kpcas:
Example No. 9
    yhat = naive_prediction(testX, value)
    # evaluate
    score = accuracy_score(testy, yhat)
    # summarize
    print('Naive=%d score=%.3f' % (value, score))
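naive_prediction is not defined in this excerpt; a plausible sketch, assuming it simply predicts the constant class `value` for every test row:

def naive_prediction(testX, value):
    # predict the same class for every example
    return [value for _ in range(len(testX))]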

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # random_state requires shuffle=True
    cv_results = model_selection.cross_val_score(model,
                                                 trainX,
                                                 trainy,
                                                 cv=kfold,
                                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
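The loop above collects each model's scores but never reports them; the usual follow-up line inside the loop (a sketch, not shown in the original) is:

    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))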
X=df.drop(['Outcome'], axis=1)
y=df['Outcome']


from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)


dec_cls=DecisionTreeClassifier(max_depth=5)


dec_cls.fit(X_train,y_train)


y_pred=dec_cls.predict(X_test)


Example No. 11
    # Success
    print ("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    # Return the results
    return results


# Import the three supervised learning models from sklearn

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


# TODO: Initialize the three models
clf_A = SVC()
clf_B = DecisionTreeClassifier(min_samples_split=20)
clf_C = AdaBoostClassifier()

# Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100
# HINT: samples_1 is 1% of samples_100
samples_100 = len(y_train)
samples_10 = len(y_train) // 10
samples_1 = len(y_train) // 100


# Collect results on the learners
results = {}
results = train_predict(clf_A, samples_1, X_train, y_train, X_test, y_test)
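Note that `results` is re-assigned by the call above, discarding the dict it was meant to fill; the template this comes from typically loops over all learners and sample sizes, roughly as follows (a sketch, not the original code):

for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)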
#summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :',accuracy_score(y_test,y_pred))

"""### **Decision Tree Classifier**"""

#Decision Tree Classifier
#importing the library
from sklearn.tree import DecisionTreeClassifier
#creating local variable classifier
classifier = DecisionTreeClassifier()
#Training the model
classifier.fit(X_train,y_train)

#predicting the value of Y
y_pred = classifier.predict(X_test)

#importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
Example No. 13
def DecisionTreeModel(feature_set):
    c = SklearnClassifier(DecisionTreeClassifier())
    accuracies = cross_validation(c, feature_set)
    print_metrics("Decision Tree", accuracies)
Example No. 14
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from chapter07_Adaboost import adaColic

# see: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
if __name__ == '__main__':
    Xtrain, ytrain = adaColic.loadDataSet('horseColicTraining2.txt')
    Xtest, ytest = adaColic.loadDataSet('horseColicTest2.txt')
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             algorithm='SAMME',
                             n_estimators=10)
    clf.fit(Xtrain, ytrain)
    predictions = clf.predict(Xtrain)
    errArr = np.mat(np.ones((len(Xtrain), 1)))
    print('training set error rate: %.3f%%' %
          (float(errArr[predictions != ytrain].sum()) / len(Xtrain) * 100.0))
    predictions = clf.predict(Xtest)
    errArr = np.mat(np.ones((len(Xtest), 1)))
    print('test set error rate: %.3f%%' %
          (float(errArr[predictions != ytest].sum()) / len(Xtest) * 100.0))
Example No. 15
df = df.sample(frac=1).reset_index(drop=True)
df['lbl'] = 1.0
df.loc[df['type']=='R', 'lbl'] = 0.0
df.drop('type', axis=1, inplace=True)
df = df.astype(np.float32)  # DataFrame.astype has no inplace parameter
feature_names = ['c' + str(i) for i in range(60)]
label_name =['lbl']

# section 2: prep train and test data
test_x = df[:70][feature_names].to_numpy()   # .get_values() was removed from pandas
test_y = df[:70][label_name].to_numpy().ravel()
train_x = df[70:][feature_names].to_numpy()
train_y = df[70:][label_name].to_numpy().ravel()

# section 3: take a look at performance of sklearn decision tree and randomforest
clf = DecisionTreeClassifier()
clf.fit(train_x, train_y)
print("Sklearn Decision Tree Classifier", clf.score(test_x, test_y))

rfclf = RandomForestClassifier(n_jobs=2)
rfclf.fit(train_x, train_y)
print("Sklearn Random Forest Classifier", rfclf.score(test_x, test_y))


# section 4: my first practice of random forest
m = 10
votes = [1/m] * m
num_train = len(train_x)
num_feat = len(train_x[0])
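The excerpt ends after setting up m, votes, num_train and num_feat; a minimal sketch of the hand-rolled bagging loop this appears to be leading into (an assumption, not the original code):

import numpy as np

forest = []
for _ in range(m):
    # bootstrap-sample the training set with replacement
    idx = np.random.choice(num_train, num_train, replace=True)
    t = DecisionTreeClassifier(max_features='sqrt')  # random feature subsets per split
    t.fit(train_x[idx], train_y[idx])
    forest.append(t)

# average the trees' class probabilities, weighted by `votes`
probs = sum(v * t.predict_proba(test_x) for v, t in zip(votes, forest))
print("Hand-rolled forest accuracy:", (probs.argmax(axis=1) == test_y).mean())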

import time

import pandas as pd
from numpy import ravel
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


raw_data_df = pd.read_csv("train_data.csv")
raw_class_df = pd.read_csv("train_class.csv")

start_time = time.time()
data_train, data_verif, class_train, class_verif = train_test_split(raw_data_df, 
                                                                    raw_class_df, 
                                                                    test_size = 0.3, 
                                                                    random_state = 2, 
                                                                    stratify = raw_class_df)

#data_verif, class_train, class_verif
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100)
clf.fit(data_train, ravel(class_train))

prediction = clf.predict(data_verif)

pred = clf.predict_proba(data_verif)

tn, fp, fn, tp = confusion_matrix(class_verif, prediction).ravel()
print("tn: ", tn, "fp: ", fp, "fn: ", fn, "tp: ", tp)
print("Confusion Matrix: \n" + str(confusion_matrix(class_verif, prediction)))
print ("Accuracy : " + str(accuracy_score(class_verif, prediction)*100))
print("Report : \n" + str(classification_report(class_verif, prediction)))

# keep probabilities for the positive outcome only
pred = pred[:, 1]
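A natural next step with the positive-class probabilities (a sketch, not in the original) is computing the ROC AUC:

from sklearn.metrics import roc_auc_score
print("ROC AUC:", roc_auc_score(class_verif, pred))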
	print('\n4) tf-idf vectorization')
	do_smth_with_model(steps=[('vect', CountVectorizer()),
							  ('tfidf', TfidfTransformer()),
							  ('classifier', MultinomialNB())])

	# Naive Bayes, tf-idf vectorization, fit_prior=False
	print('\nExtra: tf-idf vectorization, fit_prior=False')
	do_smth_with_model(steps=[('vect', CountVectorizer()),
							  ('tfidf', TfidfTransformer()),
							  ('classifier', MultinomialNB(fit_prior=False))])

	# Decision tree, tf-idf
	print('\nDecision Tree')
	pipeline, label_predicted = do_smth_with_model(steps=[('vect', CountVectorizer()),
							  ('tfidf', TfidfTransformer()),
							  ('classifier', DecisionTreeClassifier())])

	draw_learning_curve(pipeline)
	draw_roc_curve(label_predicted)
	print('The learning curve shows that adding training data may improve the cross-validation score slightly, '
		  'while the training score stays flat')
	print('Judging by the ROC curve, the classifier performs well, though naive Bayes did better '
		  '(see the slope of the blue line). Its AUC value is worse than Bayes but better than the random forest')

	# Random forest, tf-idf
	print('\nRandomForestClassifier')
	pipeline, label_predicted = do_smth_with_model(steps=[('vect', CountVectorizer()),
							  ('tfidf', TfidfTransformer()),
							  ('classifier', RandomForestClassifier())])

	draw_learning_curve(pipeline)
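draw_learning_curve is not shown in the excerpt; a plausible sketch using sklearn's learning_curve (the signature is an assumption; the original presumably closes over its module-level training data):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def draw_learning_curve(pipeline, X, y):
    # score the pipeline on growing slices of the training data
    sizes, train_scores, valid_scores = learning_curve(pipeline, X, y, cv=5)
    plt.plot(sizes, np.mean(train_scores, axis=1), label='training score')
    plt.plot(sizes, np.mean(valid_scores, axis=1), label='cross-validation score')
    plt.xlabel('training examples')
    plt.ylabel('score')
    plt.legend()
    plt.show()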
    bl1.update (ml6)
    '''

    _trainfeatures, _trainlabels, _testfeatures, _testlabels = split(bf1, bl1)

    #(features, labels) = adapt (bf1, bl1)
    (trainfeatures, trainlabels) = adapt (_trainfeatures, _trainlabels)
    (testfeatures, testlabels) = adapt (_testfeatures, _testlabels)

    #models = (RandomForestClassifier(n_estimators = 128, random_state=0), )#GaussianProcessClassifier(), ExtraTreesClassifier(n_estimators=120), AdaBoostClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())
    #models = (ExtraTreesClassifier(n_estimators=128, random_state=0),  AdaBoostClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), )#SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())
    #models = (SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())

    #models = (RandomForestClassifier(n_estimators = 128, random_state=0), )#SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())

    models = (RandomForestClassifier(n_estimators = 128, random_state=0), SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())

    #models = (RandomForestClassifier(n_estimators = 120, random_state=0), )#ExtraTreesClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), MultinomialNB())

    #models = (RandomForestClassifier(n_estimators = 120, random_state=0), )#ExtraTreesClassifier(n_estimators=120), )#GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), MultinomialNB())

    #fsets = (FSET_FULL,FSET_NOICC, FSET_MIN, FSET_YYY_G, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
    #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YY, FSET_YYY):

    fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YYY)

    #fsets = (FSET_FULL, FSET_Y, FSET_YYY)

    #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_YYY, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
    #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_YYY, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
    #fsets = (FSET_FULL, FSET_G, FSET_SEC, FSET_YYY, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
Example No. 19
#Run cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=10, n_repeats=10)
gnbScores = cross_val_score(gnb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Gaussian Naive Bayes Accuracy: %0.2f (+/- %0.2f)" %
      (gnbScores.mean(), gnbScores.std() * 2))

#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy',
                             max_depth=11,
                             random_state=150)
dtc = dtc.fit(X, y)

#Run cross validation (cross_val_score and RepeatedKFold are already imported above)
cv = RepeatedKFold(n_splits=10, n_repeats=10)
dtcScores = cross_val_score(dtc, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Decision Tree Classifier Accuracy: %0.2f (+/- %0.2f)" %
      (dtcScores.mean(), dtcScores.std() * 2))

#KNN Classifier
from sklearn.neighbors import KNeighborsClassifier
Example No. 20
    def update(self):
        '''
        Decision Tree ML
        :return:
        '''

        #        ff_happiness["Happiness.Score"]
        self.list_corr_features = pd.DataFrame([])
        # collect the checked features into one DataFrame
        feature_checks = [self.feature0, self.feature1, self.feature2,
                          self.feature3, self.feature4, self.feature5,
                          self.feature6, self.feature7]
        for checkbox, feature in zip(feature_checks, features_list):
            if checkbox.isChecked():
                if len(self.list_corr_features) == 0:
                    self.list_corr_features = ff_happiness[feature]
                else:
                    self.list_corr_features = pd.concat(
                        [self.list_corr_features, ff_happiness[feature]],
                        axis=1)

        vtest_per = float(self.txtPercentTest.text())
        vmax_depth = int(self.txtMaxDepth.text())  # max_depth must be an integer

        self.ax1.clear()
        self.ax2.clear()
        self.ax3.clear()
        self.txtResults.clear()
        self.txtResults.setUndoRedoEnabled(False)

        vtest_per = vtest_per / 100

        X_dt = self.list_corr_features
        y_dt = ff_happiness["Happiness.Scale"]

        class_le = LabelEncoder()

        # fit and transform the class

        y_dt = class_le.fit_transform(y_dt)

        # split the dataset into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            X_dt, y_dt, test_size=vtest_per, random_state=100)
        # perform training with entropy.
        # Decision tree with entropy
        self.clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                                  random_state=100,
                                                  max_depth=vmax_depth,
                                                  min_samples_leaf=5)

        # Performing training
        self.clf_entropy.fit(X_train, y_train)

        # prediction on test using entropy
        y_pred_entropy = self.clf_entropy.predict(X_test)

        # confusion matrix for entropy model

        conf_matrix = confusion_matrix(y_test, y_pred_entropy)

        # classification report

        self.ff_class_rep = classification_report(y_test, y_pred_entropy)
        self.txtResults.appendPlainText(self.ff_class_rep)

        # accuracy score

        self.ff_accuracy_score = accuracy_score(y_test, y_pred_entropy) * 100
        self.txtAccuracy.setText(str(self.ff_accuracy_score))

        self.ax1.set_xlabel('Predicted label')
        self.ax1.set_ylabel('True label')

        class_names1 = ['', 'Happy', 'Med.Happy', 'Low.Happy', 'Not.Happy']

        self.ax1.matshow(conf_matrix, cmap=plt.cm.get_cmap('Blues', 14))
        self.ax1.set_yticklabels(class_names1)
        self.ax1.set_xticklabels(class_names1, rotation=90)

        # annotate each cell with its count; compute the probabilities once for the ROC plots below
        y_pred_score = self.clf_entropy.predict_proba(X_test)
        for i in range(conf_matrix.shape[0]):
            for j in range(conf_matrix.shape[1]):
                self.ax1.text(j, i, str(conf_matrix[i][j]))

        self.fig.tight_layout()
        self.fig.canvas.draw_idle()

        #####################
        # End Graph 1
        #####################

        ##########################
        # Graph 1 -- ROC
        ##########################

        y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])
        n_classes = y_test_bin.shape[1]

        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(),
                                                  y_pred_score.ravel())

        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        lw = 2
        self.ax2.plot(fpr[2],
                      tpr[2],
                      color='darkorange',
                      lw=lw,
                      label='ROC curve (area = %0.2f)' % roc_auc[2])
        self.ax2.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        self.ax2.set_xlim([0.0, 1.0])
        self.ax2.set_ylim([0.0, 1.05])
        self.ax2.set_xlabel('False Positive Rate')
        self.ax2.set_ylabel('True Positive Rate')
        self.ax2.set_title('ROC Curve Decision Tree')
        self.ax2.legend(loc="lower right")

        self.fig2.tight_layout()
        self.fig2.canvas.draw_idle()
        #--------------------------------
        ### Graph 3 Roc Curve by class
        #---------------------------------
        str_classes = ['HP', 'MEH', 'LOH', 'NH']
        colors = cycle(['magenta', 'darkorange', 'green', 'blue'])
        for i, color in zip(range(n_classes), colors):
            self.ax3.plot(fpr[i],
                          tpr[i],
                          color=color,
                          lw=lw,
                          label='{0} (area = {1:0.2f})'
                          ''.format(str_classes[i], roc_auc[i]))

        self.ax3.plot([0, 1], [0, 1], 'k--', lw=lw)
        self.ax3.set_xlim([0.0, 1.0])
        self.ax3.set_ylim([0.0, 1.05])
        self.ax3.set_xlabel('False Positive Rate')
        self.ax3.set_ylabel('True Positive Rate')
        self.ax3.set_title('ROC Curve by Class')
        self.ax3.legend(loc="lower right")

        # show the plot
        self.fig3.tight_layout()
        self.fig3.canvas.draw_idle()
Example No. 21
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.metrics import accuracy_score

# Loading dataset
iris=datasets.load_iris()

#dataframe
df=pd.DataFrame(iris.data, columns=iris.feature_names)
print(df.head(5))

y=iris.target
#print(y)

#decision tree algorithm
from sklearn.tree import DecisionTreeClassifier
dtree=DecisionTreeClassifier()
dtree.fit(df,y)
print('Decision Tree Classifier Created')

#PLOT 
from sklearn.tree import plot_tree
model_all_params = DecisionTreeClassifier().fit(iris.data, iris.target)
plt.figure(figsize = (20,10)) #set size
plot_tree(model_all_params, 
          filled=True      )
plt.show()

#accuracy
y_pred = dtree.predict(df)
print('\nAccuracy: {0:.4f}'.format(accuracy_score(y, y_pred)))
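Note that the accuracy above is computed on the same data the tree was fit on, so it will be close to 1.0 by construction; a more honest estimate (a sketch, not in the original) uses cross-validation:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(DecisionTreeClassifier(), df, y, cv=5)
print('CV accuracy: {0:.4f}'.format(scores.mean()))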
Example No. 22
# drop the original Sex and Embarked columns
data.drop(["Sex", "Embarked"], inplace=True, axis=1)

# merge the encoded columns back into the original data
newdata = pd.concat([data, data_Sex_df, data_Embarked_df], axis=1)
print(newdata)

# split features and label
X = newdata.iloc[:, newdata.columns != "Survived"]
y = newdata.iloc[:,newdata.columns == "Survived"]

# split into training and test sets
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y)

# instantiate the model
clf = DecisionTreeClassifier(random_state=666)

# use cross-validation to pick the best number of folds
cv_score = []
for i in range(2,10):
    score = cross_val_score(clf,X,y,cv=i).mean()
    cv_score.append(score)

best_cv = cv_score.index(max(cv_score)) + 2

# grid search for the best hyperparameters
parameters = {"splitter":('best','random')
              ,"max_depth":[*range(1,5)]
              ,"min_samples_leaf":[*range(1,10)]
             }
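The excerpt ends with the parameter grid defined but never searched; a minimal continuation (a sketch, assuming GridSearchCV is available):

from sklearn.model_selection import GridSearchCV
GS = GridSearchCV(clf, parameters, cv=best_cv)
GS.fit(Xtrain, Ytrain.values.ravel())
print(GS.best_params_, GS.best_score_)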
Example No. 23
# Neural Network Classifier #
train_x, test_x, train_y, test_y = train_test_split(transformed_samples, y, test_size=0.5, random_state=42)

NN_clf = MLPClassifier(solver='lbfgs', max_iter=400,alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN_model = NN_clf.fit(train_x, train_y)
predicted = NN_model.predict(test_x)
print('The accuracy score for NN classifier is : ')
print (accuracy_score(test_y, predicted))

filename = 'LDA_NN_model.sav'
pickle.dump(NN_model, open(filename, 'wb'))  

# Decision tree Classifier #

Tree_clf = DecisionTreeClassifier(random_state = 0)
Tree_model = Tree_clf.fit(train_x, train_y)
predicted_Tree = Tree_model.predict(test_x)
print('The accuracy score for Tree classifier is : ')
print (accuracy_score(test_y, predicted_Tree))

filename1 = 'LDA_Tree_model.sav'
pickle.dump(Tree_model, open(filename1, 'wb'))

# Plotting the scatter plot of the new feature space #
class_mapping = {0 : 'normal',1 : 'DOS',2 : 'U2R',3 : 'R2L',4 : 'PROBE'}
for lab,marker,color in zip(
        range(0,5),('^', 's', 'o','*','D'),('blue', 'red', 'green','black','yellow')):

        plt.scatter(x=transformed_samples[:,0].real[y == lab],
                y=transformed_samples[:,1].real[y == lab],
targets=targets.astype('int64')

#Correlations
print(DataFrame(reduced_predictors).join(targets).corr())

#Split as training and testing
pred_train, pred_test, tar_train, tar_test  =   train_test_split(DataFrame(reduced_predictors), targets, test_size=.3)

print(pred_train.shape, pred_test.shape, tar_train.shape, tar_test.shape)

#Build model on training data

#Using AdaBoost with decision trees
import sklearn.metrics
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
#classifier=ensemble.BaggingClassifier(DecisionTreeClassifier())
classifier=ensemble.AdaBoostClassifier(DecisionTreeClassifier())


classifier=classifier.fit(pred_train,tar_train)

predictions=classifier.predict(pred_test)

print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))
print(sklearn.metrics.classification_report(tar_test, predictions))

    def testGreadSearchCV(self):

        # Build the grid-search parameter space from the UI controls, falling
        # back to the stored configuration row when a control is unchecked.
        # NOTE: the original code gates every parameter except 'criterion' on
        # checkBoxMaxDepth; that behaviour is preserved here.
        def build_param_space(config_row):
            space = {}
            if self.ui.checkBoxCriterion.isChecked():
                space['criterion'] = [self.ui.lineEditCriterion_2.text().split(',')][0]
            else:
                space['criterion'] = [self.konfiguracja.loc[config_row, 'criterion']]
            if self.ui.checkBoxMaxDepth.isChecked():
                space['max_depth'] = np.arange(self.ui.spinBoxMaxDepthOd.value(),
                                               self.ui.spinBoxMaxDepthDo.value(),
                                               self.ui.spinBoxMaxDepthCo.value())
                space['random_state'] = [self.ui.spinBoxRandomState_2.value()]
                space['min_samples_leaf'] = np.arange(self.ui.spinBoxMinSamplesLeafOd.value(),
                                                      self.ui.spinBoxMinSamplesLeafDo.value(),
                                                      self.ui.spinBoxMinSaplesLeafCo.value())
                space['min_samples_split'] = np.arange(self.ui.spinBoxMinSamplesSplitOd.value(),
                                                       self.ui.spinBoxMinSaplesSplitDo.value(),
                                                       self.ui.spinBoxMinSamplesSplitCo.value())
                space['splitter'] = [self.ui.lineEditSpliter_2.text().split(',')][0]
            else:
                for param in ('max_depth', 'random_state', 'min_samples_leaf',
                              'min_samples_split', 'splitter'):
                    space[param] = [self.konfiguracja.loc[config_row, param]]
            return space

        if self.ui.comboBoxKlasyfikatory.currentIndex() == 0:
            X = self.tabelaBazowa[self.tabelaBazowa.columns[0:48]].copy()
            y = self.tabelaBazowa[self.tabelaBazowa.columns[48]].copy()

            for index, row in y.items():  # .iteritems() was removed from pandas
                if row in 'DBScan euclidean':
                    y[index] = 11
                elif row in 'DBScan cityblock':
                    y[index] = 12
                elif row in 'DBScan cosine':
                    y[index] = 13
                elif row in 'KMeans euclidean':
                    y[index] = 21
                elif row in 'KMeans cityblock':
                    y[index] = 22
                elif row in 'KMeans cosine':
                    y[index] = 23
                elif row in 'Agglomerative euclidean':
                    y[index] = 31
                elif row in 'Agglomerative cityblock':
                    y[index] = 32
                elif row in 'Agglomerative cosine':
                    y[index] = 33

            y = y.astype(int)

            mlp = DecisionTreeClassifier()
            my_cv = LeaveOneOut()

            if self.czySilhouette:
                parametr_space = build_param_space('Silhouette_klasyfikacja')
            elif self.czyDaviesBouldin:
                parametr_space = build_param_space('DaviesBouldin_klasyfikacja')
            elif self.czyCalinskiHarabasz:
                parametr_space = build_param_space('CalinskiHarabasz_klasyfikacja')
            else:
                parametr_space = {}

            clf = GridSearchCV(mlp,
                               parametr_space,
                               n_jobs=-1,
                               cv=my_cv,
                               verbose=3)
            clf.fit(X, y)
            print('Best parameters:\n', clf.best_params_)

        elif self.ui.comboBoxKlasyfikatory.currentIndex() == 1:
            X = self.tabelaBazowa[self.tabelaBazowa.columns[0:49]].copy()
            y = self.tabelaBazowa[self.tabelaBazowa.columns[49]].copy()

            temp = X[X.columns[-1]].copy()

            for index, row in temp.items():  # .iteritems() was removed from pandas
                if row in 'DBScan euclidean':
                    temp[index] = 11
                elif row in 'DBScan cityblock':
                    temp[index] = 12
                elif row in 'DBScan cosine':
                    temp[index] = 13
                elif row in 'KMeans euclidean':
                    temp[index] = 21
                elif row in 'KMeans cityblock':
                    temp[index] = 22
                elif row in 'KMeans cosine':
                    temp[index] = 23
                elif row in 'Agglomerative euclidean':
                    temp[index] = 31
                elif row in 'Agglomerative cityblock':
                    temp[index] = 32
                elif row in 'Agglomerative cosine':
                    temp[index] = 33

            X['Metoda_metryka'] = temp.copy()
            y = y.astype(int)

            mlp = DecisionTreeRegressor()
            my_cv = LeaveOneOut()

            parametr_space = {}

            if self.czySilhouette:
                if self.ui.checkBoxCriterion.isChecked():
                    parametr_space.update({
                        'criterion':
                        [self.ui.lineEditCriterion_2.text().split(',')][0]
                    })
                else:
                    parametr_space.update({
                        'criterion': [
                            self.konfiguracja.loc['Silhouette_regresja',
                                                  'criterion']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'max_depth':
                        np.arange(self.ui.spinBoxMaxDepthOd.value(),
                                  self.ui.spinBoxMaxDepthDo.value(),
                                  self.ui.spinBoxMaxDepthCo.value())
                    })
                else:
                    parametr_space.update({
                        'max_depth': [
                            self.konfiguracja.loc['Silhouette_regresja',
                                                  'max_depth']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'random_state': [self.ui.spinBoxRandomState_2.value()]
                    })
                else:
                    parametr_space.update({
                        'random_state': [
                            self.konfiguracja.loc['Silhouette_regresja',
                                                  'random_state']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'min_samples_leaf':
                        np.arange(self.ui.spinBoxMinSamplesLeafOd.value(),
                                  self.ui.spinBoxMinSamplesLeafDo.value(),
                                  self.ui.spinBoxMinSaplesLeafCo.value())
                    })
                else:
                    parametr_space.update({
                        'min_samples_leaf': [
                            self.konfiguracja.loc['Silhouette_regresja',
                                                  'min_samples_leaf']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'min_samples_split':
                        np.arange(self.ui.spinBoxMinSamplesSplitOd.value(),
                                  self.ui.spinBoxMinSaplesSplitDo.value(),
                                  self.ui.spinBoxMinSamplesSplitCo.value())
                    })
                else:
                    parametr_space.update({
                        'min_samples_split': [
                            self.konfiguracja.loc['Silhouette_regresja',
                                                  'min_samples_split']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'splitter':
                        [self.ui.lineEditSpliter_2.text().split(',')][0]
                    })
                else:
                    parametr_space.update({
                        'splitter': [
                            self.konfiguracja.loc['Silhouette_regresja',
                                                  'splitter']
                        ]
                    })

            elif self.czyDaviesBouldin:
                if self.ui.checkBoxCriterion.isChecked():
                    parametr_space.update({
                        'criterion':
                        [self.ui.lineEditCriterion_2.text().split(',')][0]
                    })
                else:
                    parametr_space.update({
                        'criterion': [
                            self.konfiguracja.loc['DaviesBouldin_regresja',
                                                  'criterion']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'max_depth':
                        np.arange(self.ui.spinBoxMaxDepthOd.value(),
                                  self.ui.spinBoxMaxDepthDo.value(),
                                  self.ui.spinBoxMaxDepthCo.value())
                    })
                else:
                    parametr_space.update({
                        'max_depth': [
                            self.konfiguracja.loc['DaviesBouldin_regresja',
                                                  'max_depth']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'random_state': [self.ui.spinBoxRandomState_2.value()]
                    })
                else:
                    parametr_space.update({
                        'random_state': [
                            self.konfiguracja.loc['DaviesBouldin_regresja',
                                                  'random_state']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'min_samples_leaf':
                        np.arange(self.ui.spinBoxMinSamplesLeafOd.value(),
                                  self.ui.spinBoxMinSamplesLeafDo.value(),
                                  self.ui.spinBoxMinSaplesLeafCo.value())
                    })
                else:
                    parametr_space.update({
                        'min_samples_leaf': [
                            self.konfiguracja.loc['DaviesBouldin_regresja',
                                                  'min_samples_leaf']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'min_samples_split':
                        np.arange(self.ui.spinBoxMinSamplesSplitOd.value(),
                                  self.ui.spinBoxMinSaplesSplitDo.value(),
                                  self.ui.spinBoxMinSamplesSplitCo.value())
                    })
                else:
                    parametr_space.update({
                        'min_samples_split': [
                            self.konfiguracja.loc['DaviesBouldin_regresja',
                                                  'min_samples_split']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'splitter':
                        [self.ui.lineEditSpliter_2.text().split(',')][0]
                    })
                else:
                    parametr_space.update({
                        'splitter': [
                            self.konfiguracja.loc['DaviesBouldin_regresja',
                                                  'splitter']
                        ]
                    })

            elif self.czyCalinskiHarabasz:
                if self.ui.checkBoxCriterion.isChecked():
                    parametr_space.update({
                        'criterion':
                        [self.ui.lineEditCriterion_2.text().split(',')][0]
                    })
                else:
                    parametr_space.update({
                        'criterion': [
                            self.konfiguracja.loc['CalinskiHarabasz_regresja',
                                                  'criterion']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'max_depth':
                        np.arange(self.ui.spinBoxMaxDepthOd.value(),
                                  self.ui.spinBoxMaxDepthDo.value(),
                                  self.ui.spinBoxMaxDepthCo.value())
                    })
                else:
                    parametr_space.update({
                        'max_depth': [
                            self.konfiguracja.loc['CalinskiHarabasz_regresja',
                                                  'max_depth']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'random_state': [self.ui.spinBoxRandomState_2.value()]
                    })
                else:
                    parametr_space.update({
                        'random_state': [
                            self.konfiguracja.loc['CalinskiHarabasz_regresja',
                                                  'random_state']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'min_samples_leaf':
                        np.arange(self.ui.spinBoxMinSamplesLeafOd.value(),
                                  self.ui.spinBoxMinSamplesLeafDo.value(),
                                  self.ui.spinBoxMinSaplesLeafCo.value())
                    })
                else:
                    parametr_space.update({
                        'min_samples_leaf': [
                            self.konfiguracja.loc['CalinskiHarabasz_regresja',
                                                  'min_samples_leaf']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'min_samples_split':
                        np.arange(self.ui.spinBoxMinSamplesSplitOd.value(),
                                  self.ui.spinBoxMinSaplesSplitDo.value(),
                                  self.ui.spinBoxMinSamplesSplitCo.value())
                    })
                else:
                    parametr_space.update({
                        'min_samples_split': [
                            self.konfiguracja.loc['CalinskiHarabasz_regresja',
                                                  'min_samples_split']
                        ]
                    })

                if self.ui.checkBoxMaxDepth.isChecked():
                    parametr_space.update({
                        'splitter':
                        self.ui.lineEditSpliter_2.text().split(',')
                    })
                else:
                    parametr_space.update({
                        'splitter': [
                            self.konfiguracja.loc['CalinskiHarabasz_regresja',
                                                  'splitter']
                        ]
                    })

            clf = GridSearchCV(mlp,
                               parametr_space,
                               n_jobs=-1,
                               cv=my_cv,
                               verbose=3,
                               scoring='max_error')
            clf.fit(X, y)
            print('Best parameters:\n', clf.best_params_)
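
# The checkbox/else pattern above can be collapsed into a small helper. A
# minimal, self-contained sketch follows (the helper name, the value ranges and
# the DecisionTreeRegressor estimator are illustrative assumptions, not part of
# the GUI code above):
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

def grid_entry(enabled, tuned_values, default_value):
    """Return the tuned range when its checkbox is ticked, else the configured default."""
    return list(tuned_values) if enabled else [default_value]

param_space = {
    'max_depth': grid_entry(True, np.arange(2, 12, 2), None),
    'min_samples_leaf': grid_entry(False, np.arange(1, 10), 1),
}
search = GridSearchCV(DecisionTreeRegressor(random_state=0), param_space,
                      n_jobs=-1, cv=5, scoring='max_error')
# search.fit(X, y) would then run exactly as in the example above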
Exemplo n.º 26
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature scaling - LR does not do this automatically; we need to do it manually
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

# fit the classifier to the training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)

#predict test set results
y_pred = classifier.predict(x_test)

# build the confusion matrix to assess the model's predictive power
from sklearn.metrics import confusion_matrix  # top-left and bottom-right are correct predictions; top-right and bottom-left are wrong ones
cm = confusion_matrix(y_test, y_pred)
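# For a binary target, unpacking the matrix makes its layout explicit
# (rows = actual class, columns = predicted class); an optional addition:
tn, fp, fn, tp = cm.ravel()
print('TN={} FP={} FN={} TP={}'.format(tn, fp, fn, tp))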

# visualise the test set: shows the actual results and the prediction regions
import numpy as np
from matplotlib.colors import ListedColormap
X_set, y_set = x_test, y_test
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    # the source snippet is truncated here; the step value and the second
    # axis are filled in by assumption so the call parses
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
              step=0.01))
Exemplo n.º 27
    After that, it's not our code anymore--it's yours!
"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl",
                             "rb"))  # pickle files must be opened in binary mode

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### it's all yours from here forward!
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20

features_train, features_test, labels_train, labels_test = \
 train_test_split(features, labels, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print('Accuracy:', accuracy_score(labels_test, pred))
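# Optional check (not part of the original starter code): an unpruned tree
# usually fits the training data almost perfectly, so comparing train and
# test accuracy makes any overfitting visible.
print('Train accuracy:', accuracy_score(labels_train, clf.predict(features_train)))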
Exemplo n.º 28
# Imports needed by this excerpt (they are not shown in the source)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.tree import DecisionTreeClassifier


def DT(X, y, train_size, data_name):
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size)

    # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
    # Fit classification model
    dt = DecisionTreeClassifier()
    path = dt.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
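    # ccp_alphas is an increasing sequence of effective pruning strengths;
    # impurities[i] is the total leaf impurity of the subtree obtained with
    # alpha = ccp_alphas[i] (the final entry corresponds to the root-only tree).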

    fig, ax = plt.subplots()
    ax.plot(ccp_alphas[:-1],
            impurities[:-1],
            marker='o',
            drawstyle="steps-post")
    ax.set_xlabel("effective alpha")
    ax.set_ylabel("total impurity of leaves")
    ax.set_title("Total Impurity vs effective alpha for training set")

    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]))

    # %%
    # For the remainder of this example, we remove the last element in
    # ``clfs`` and ``ccp_alphas``, because it is the trivial tree with only one
    # node. Here we show that the number of nodes and tree depth decreases as alpha
    # increases.
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

    node_counts = [clf.tree_.node_count for clf in clfs]
    depth = [clf.tree_.max_depth for clf in clfs]
    fig, ax = plt.subplots(2, 1)
    ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
    ax[0].set_xlabel("alpha")
    ax[0].set_ylabel("number of nodes")
    ax[0].set_title("Number of nodes vs alpha")
    ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
    ax[1].set_xlabel("alpha")
    ax[1].set_ylabel("depth of tree")
    ax[1].set_title("Depth vs alpha")
    fig.tight_layout()

    # %%
    # Accuracy vs alpha for training and testing sets
    # ----------------------------------------------------
    # With ``ccp_alpha`` set to zero and the other parameters of
    # :class:`DecisionTreeClassifier` left at their defaults, the tree overfits.
    # In the scikit-learn example this code follows, that meant 100% training
    # accuracy versus 88% testing accuracy, and ``ccp_alpha=0.015`` maximized
    # the testing accuracy; the exact values depend on the dataset passed in.
    # As alpha increases, more of the tree is pruned, yielding a tree that
    # generalizes better.
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas,
            train_scores,
            marker='o',
            label="train",
            drawstyle="steps-post")
    ax.plot(ccp_alphas,
            test_scores,
            marker='o',
            label="test",
            drawstyle="steps-post")
    ax.legend()
    plt.show()
    # %%
    best_alpha = 0.040790348647614105
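    # A programmatic alternative to the hard-coded constant (a sketch; it picks
    # the alpha with the highest held-out accuracy from the pruning sweep above):
    # best_alpha = ccp_alphas[int(np.argmax(test_scores))]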
    # %%
    # Create CV training and test scores for various training set sizes
    train_sizes, train_scores, test_scores = learning_curve(
        DecisionTreeClassifier(ccp_alpha=best_alpha),
        X,
        y,
        # Number of folds in cross-validation
        cv=5,
        # Evaluation metric
        scoring='accuracy',
        # Use all computer cores
        n_jobs=-1,
        # 50 different sizes of the training set
        train_sizes=np.linspace(0.01, 1.0, 50))

    print(train_scores)
    # Create means and standard deviations of training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Create means and standard deviations of test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines
    plt.plot(train_sizes,
             train_mean,
             '--',
             color="#111111",
             label="Training score")
    plt.plot(train_sizes,
             test_mean,
             color="#111111",
             label="Cross-validation score")

    # Draw bands
    plt.fill_between(train_sizes,
                     train_mean - train_std,
                     train_mean + train_std,
                     color="#DDDDDD")
    plt.fill_between(train_sizes,
                     test_mean - test_std,
                     test_mean + test_std,
                     color="#DDDDDD")

    # Create plot
    plt.title("DT Learning Curve - {}".format(data_name))
    plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(
        loc="best")
    plt.tight_layout()
    plt.show()
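
# Example invocation of DT (a sketch; it assumes some feature matrix X and
# label vector y have already been loaded):
# DT(X, y, train_size=0.8, data_name='example')
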
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X = iris['data'][:, [2, 3]]
y = iris['target']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1,
                                                    stratify=y)
# We are not doing feature scaling, but it might be helpful

tree_model = DecisionTreeClassifier(criterion='gini',
                                    max_depth=4,
                                    random_state=1)

tree_model.fit(X_train, y_train)
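
# plot_decision_region is not defined in this excerpt; below is a minimal
# sketch of a compatible helper (in the spirit of the well-known
# plot_decision_regions utility), matching the call that follows.
from matplotlib.colors import ListedColormap

def plot_decision_region(X, y, classifier, test_idx=None, resolution=0.02):
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # paint the decision surface over a grid spanning both features
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    plt.contourf(xx1, xx2, Z.reshape(xx1.shape), alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # overlay the samples, one marker per class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[y == cl, 0], X[y == cl, 1], alpha=0.8,
                    c=colors[idx], marker=markers[idx],
                    label=cl, edgecolor='black')
    if test_idx is not None:
        # circle the held-out test samples
        plt.scatter(X[test_idx, 0], X[test_idx, 1], facecolors='none',
                    edgecolor='black', alpha=1.0, linewidth=1,
                    marker='o', s=100, label='test set')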

X_comb = np.vstack((X_train, X_test))
y_comb = np.hstack((y_train, y_test))
plot_decision_region(X_comb, y_comb, tree_model, test_idx=range(105, 150))

plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.tight_layout()
plt.show()

from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_sc, y_train_sc)
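# NOTE: the model is fit on scaled features (X_train_sc), so the test features
# should be scaled the same way before predicting; X_test below appears unscaled.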
pred_knn = knn.predict(X_test)
print(confusion_matrix(y_test, pred_knn))
print(classification_report(y_test, pred_knn))
print(accuracy_score(y_test, pred_knn))
knn.fit(X_train_all, y_train_all)
pred_all_knn = knn.predict(X_test_all)
sub_knn = pd.DataFrame()
sub_knn['PassengerId'] = df_test['PassengerId']
sub_knn['Survived'] = pred_all_knn
#sub_knn.to_csv('knn.csv',index=False)
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
pred_dtree = dtree.predict(X_test)
print(classification_report(y_test,pred_dtree))
print(accuracy_score(y_test, pred_dtree))
dtree_2 = DecisionTreeClassifier(max_features=7, max_depth=6, min_samples_split=8)
dtree_2.fit(X_train,y_train)
pred_dtree_2 = dtree_2.predict(X_test)
print(classification_report(y_test, pred_dtree_2))
print(accuracy_score(y_test, pred_dtree_2))
dtree_2.fit(X_train_all, y_train_all)
pred_all_dtree2 = dtree_2.predict(X_test_all)


from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=6, max_features=7)