Example #1
    def classification(self, metric, folds, alphas, graph):
        # `metric` feeds cross_val_score below; `alphas` is accepted but unused here
        size = 1.3 * self.report_width // 10

        models = {}
        models["K nearest neighbors classifier K2"]  = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]  = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)        
        models["Decision tree classifier"]           = dtc()
        models["Logistic classifier"]                = logitc()
        models["SVM classifier with RBF kernel"]     = svc(gamma='scale')
        models["SVM classifier with linear kernel"]  = svc(kernel='linear')
        models["Gaussian naive bayes"]               = gnbc()
        models["Bernoulli naive bayes"]              = bnbc()
        models["SGD classifier"]                     = sgdc(max_iter=10000)
        models["Random forest classifier"]           = rfc(n_estimators=100)
        models["Gradient boosting classifier"]       = gbc()
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        for model_name in models:
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, scoring=metric, error_score=np.nan)
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')
        report = pd.DataFrame({'Classifier': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')
        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()             
        return None
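
The short aliases used above are never defined in the snippet; presumably they come from scikit-learn imports along these lines (a sketch of the assumed setup, not shown in the source):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier as knnc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.linear_model import LogisticRegression as logitc
from sklearn.linear_model import SGDClassifier as sgdc
from sklearn.svm import SVC as svc
from sklearn.naive_bayes import GaussianNB as gnbc
from sklearn.naive_bayes import BernoulliNB as bnbc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import GradientBoostingClassifier as gbc
from IPython.display import display   # `display(report)` suggests a notebook context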
Example #2
def classification(file, X, Y, x, y):
    # Grid-search a KNN classifier over module-level hyperparameter grids and
    # record the held-out accuracy (on x, y) of every parameter combination.
    # lab_enc = prep.LabelEncoder()
    # Y = lab_enc.fit_transform(Y)
    # y = lab_enc.fit_transform(y)
    param = []
    acc = []
    for i in it.product(n_neighbors, weights, algorithm, leaf_size):
        for m in metric:
            # print(*i,m)
            if m == 'minkowski':
                for j in p:
                    # print('p=',j)
                    knn = knnc(*i, p=j, metric=m)
                    knn.fit(X, Y)
                    acc.append(knn.score(x, y))
                    param.append([*i, m, j])
            else:
                knn = knnc(*i, metric=m)
                knn.fit(X, Y)
                acc.append(knn.score(x, y))
                param.append([*i, m])
    _results(file, acc, param)
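
The search grids and the `_results` writer live at module level and are not shown. A minimal sketch of plausible definitions follows; the values and the helper body are assumptions, not part of the original. Note also that `knnc(*i, ...)` passes `weights`, `algorithm`, and `leaf_size` positionally, which recent scikit-learn releases reject; keyword arguments are safer there.

import itertools as it
from sklearn.neighbors import KNeighborsClassifier as knnc

# Hypothetical grids; any values accepted by KNeighborsClassifier would do.
n_neighbors = [2, 5, 10]
weights     = ['uniform', 'distance']
algorithm   = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size   = [15, 30]
metric      = ['euclidean', 'manhattan', 'minkowski']
p           = [1, 2]   # only used when metric == 'minkowski'

def _results(file, acc, param):
    # Hypothetical reporting helper: print the best score and its parameters.
    best = acc.index(max(acc))
    print(file, '-> best accuracy:', round(acc[best], 4), 'with', param[best])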
Example #3
    def classification(self, metric, folds, printt=True, graph=False):
        size = self.graph_width

        if len(self.y.iloc[:,0].unique()) > 2:
            struct = 'multiclass'
        else:
            struct = 'binary'

        # significant model setup differences should be listed as different models
        models = {}
        models["Linear discriminant analysis"]          = ldac()
        models["Nearest centroid classifier euclidian"] = ncc(metric='euclidean')
        models["Nearest centroid classifier manhattan"] = ncc(metric='manhattan')
        models["K nearest neighbors classifier K2"]     = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]     = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"]    = knnc(n_neighbors=10)        
        models["Decision tree classifier"]              = dtc()
        models["Gaussian naive bayes"]                  = gnbc()
        models["Bernoulli naive bayes"]                 = bnbc(binarize=0.5)
        models["Multinomial naive bayes"]               = mnbc()
        models["SGD classifier"]                        = sgdc(max_iter=10000)
        models["Ridge classifier"]                      = rc()

        if len(self.Xt_train) < 10000:
            models["SVM classifier RBF"]                = svc(gamma='scale')
            models["SVM classifier Linear"]             = svc(kernel='linear')
            models["SVM classifier Poly"]               = svc(kernel='poly')

        if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
            models["Gradient boosting classifier"]      = gbc()
            models["Random forest classifier"]          = rfc(n_estimators=100)

        if struct == 'multiclass':
            models["Logistic classifier multinomial"]   = logitc(multi_class='multinomial', solver='lbfgs')
            models["Logistic classifier auto"]          = logitc(multi_class='auto')
            models["Logistic One vs Rest"]              = ovrc(logitc())
            models["Logistic One vs One"]               = ovoc(logitc())

        if struct == 'binary':
            models["Logistic classifier"]               = logitc(max_iter=2000)

        self.models = models

        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        et = []   # elapsed cross-validation time per model
        for model_name in models:
            start = time.time()
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train, cv=kf, scoring=metric, error_score=np.nan)  
            results.append(cv_scores)
            names.append(model_name)
            et.append((time.time() - start))
        report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        report.reset_index(inplace=True, drop=True)
        self.report_performance = report

        if printt:
            print('\n')
            print(self.report_width * '*', '\n*')
            print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
            print(self.report_width * '*', '')
            print(report)
            print('\n')

        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0, bottom=0.25)
            self.graphs_model.append(fig)
            plt.show()             
        return None
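
Besides the aliases from Example #1, this snippet uses a few more; a sketch of the scikit-learn imports they presumably stand for (assumed, not shown in the source):

import time   # used for the per-model timing above
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as ldac
from sklearn.neighbors import NearestCentroid as ncc
from sklearn.naive_bayes import MultinomialNB as mnbc
from sklearn.linear_model import RidgeClassifier as rc
from sklearn.multiclass import OneVsRestClassifier as ovrc
from sklearn.multiclass import OneVsOneClassifier as ovoc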
Example #4
train, test = train_test_split(data_mod, test_size=0.2)

# X = diabetes[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
# y = diabetes[['Outcome']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2)

print(data_mod.shape)
print(train.shape)
print(test.shape)

features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Age',
    'Insulin', 'DiabetesPedigreeFunction'
]
target = 'Outcome'
classifiers = [knnc(), dtc(), SVC(gamma='auto'), SVC(kernel='linear'), gnb()]
classifier_names = [
    'K nearest neighbors', 'Decision Tree Classifier',
    'SVM classifier with RBF kernel', 'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]

for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, train[features], train[target], cv=5)

    print(clf_name, ' mean accuracy: ', round(cv_scores.mean() * 100, 3),
          '% std: ', round(cv_scores.std() * 100, 3), '%')

final_model_smv_lin = SVC(kernel='linear',
                          probability=True).fit(train[features], train[target])
# final_model_gnb = gnb().fit(train[features], train[target])
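
A minimal sketch of evaluating the fitted model on the held-out split; the metric imports are assumptions, everything else comes from the snippet above:

from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = final_model_smv_lin.predict(test[features])
y_prob = final_model_smv_lin.predict_proba(test[features])[:, 1]  # enabled by probability=True
print('test accuracy:', round(accuracy_score(test[target], y_pred), 3))
print('test ROC AUC: ', round(roc_auc_score(test[target], y_prob), 3))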
Example #5
features = [
    # ... (feature column list truncated in the source snippet)
]

target = ['defects']

# print(df)

# print(df.shape)
# print(train[features].shape)
# print(train[target].shape)
# print(train[target].values.ravel().shape)
# # print(test.shape)

# Y = train[target].values.reshape(train[target].shape[0])
# print(Y)

classifiers = [knnc(), dtc(), SVC(), SVC(kernel='linear'), gnb()]

classifier_names = [
    'K nearest neighbors', 'Decision Tree Classifier',
    'SVM classifier with RBF kernel', 'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]

# for clf, clf_name in zip(classifiers, classifier_names):
#     cv_scores = cross_val_score(clf, train[features], train[target], cv=5)

#     print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), \
#     	'% std: ', round(cv_scores.var()*100, 3),'%')

# final_model_smv_lin = SVC(kernel='linear').fit(train[features], Y)
final_model_gnb = gnb().fit(train[features], train[target].values.ravel())
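
Hypothetical usage of the fitted model; this snippet never defines a test split, so `test` below stands in for a frame produced the same way as `train`:

pred = final_model_gnb.predict(test[features])
print('defect accuracy:',
      round(final_model_gnb.score(test[features], test[target].values.ravel()), 3))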