def classification(self, metric, folds, alphas, graph):
    """Cross-validate a fixed set of baseline classifiers and report scores.

    Every candidate is scored with stratified, shuffled K-fold
    cross-validation on ``self.Xt_train`` / ``self.yt_train``. A summary
    table (mean, standard deviation and ``100 * std / mean`` of the fold
    scores, best mean first) is shown with ``display`` and, optionally, a
    box plot of the per-fold scores is drawn. The candidate estimators are
    stored in ``self.models``.

    Args:
        metric: Forwarded to ``cross_val_score`` as ``scoring``. ``None``
            falls back to each estimator's default scorer.
        folds: Number of cross-validation splits.
        alphas: Not used by this method.
        graph: When truthy, draw the box plot comparing the classifiers.

    Returns:
        None.
    """
    size = 1.3 * self.report_width // 10

    # Insertion order is the order used for the box-plot tick labels.
    models = {
        "K nearest neighbors classifier K2": knnc(n_neighbors=2),
        "K nearest neighbors classifier K5": knnc(n_neighbors=5),
        "K nearest neighbors classifier K10": knnc(n_neighbors=10),
        "Decision tree classifier": dtc(),
        "Logistic classifier": logitc(),
        "SVM classifier with RBF kernel": svc(gamma='scale'),
        "SVM classifier with linear kernel": svc(kernel='linear'),
        "Gaussian naive bayes": gnbc(),
        "Bernoulli naive bayes": bnbc(),
        "SGD classifier": sgdc(max_iter=10000),
        "Random forest classifier": rfc(n_estimators=100),
        "Gradient boosting classifier": gbc(),
    }
    self.models = models

    print('\n')
    print(self.report_width * '*', '\n*')
    print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')

    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    # Flatten the target once instead of on every iteration.
    y_train = self.yt_train.values.ravel()

    results = []
    names = []
    for model_name, model in models.items():
        # `scoring=metric` makes the caller's metric effective; it used to
        # be silently ignored in favour of the estimator's default scorer.
        cv_scores = cross_val_score(
            model,
            self.Xt_train,
            y_train,
            cv=kf,
            scoring=metric,
            error_score=np.nan,  # a failing model yields NaN scores
        )
        results.append(cv_scores)
        names.append(model_name)
    print(self.report_width * '*', '')

    report = pd.DataFrame({'Classifier': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report = report.sort_values(by='Score (avg)', ascending=False)
    report = report.drop('Score', axis=1)
    display(report)
    print('\n')

    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Classifier Comparison')
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()

    return None
def classification(file, X, Y, x, y):
    """Fit and score a k-nearest-neighbours classifier over a parameter grid.

    One classifier is fitted on ``(X, Y)`` and scored on ``(x, y)`` for each
    combination of the module-level ``n_neighbors``, ``weights``,
    ``algorithm`` and ``leaf_size`` grids crossed with every entry of
    ``metric``. For the ``'minkowski'`` metric every value of the
    module-level ``p`` grid is tried as well. The scores and the matching
    parameter rows are then handed to ``_results``.

    Args:
        file: Passed through unchanged to ``_results``.
        X: Features used to fit each classifier.
        Y: Labels used to fit each classifier.
        x: Features used to score each classifier.
        y: Labels used to score each classifier.
    """
    param = []
    acc = []
    for combo in it.product(n_neighbors, weights, algorithm, leaf_size):
        k, weighting, algo, leaf = combo
        # Pass by keyword: assuming knnc is scikit-learn's
        # KNeighborsClassifier, everything after n_neighbors is keyword-only
        # in current releases, so positional unpacking raises TypeError.
        base_kwargs = {
            'n_neighbors': k,
            'weights': weighting,
            'algorithm': algo,
            'leaf_size': leaf,
        }
        for m in metric:
            # Only the minkowski metric is swept over the power parameter p.
            if m == 'minkowski':
                variants = [{'p': j} for j in p]
            else:
                variants = [{}]
            for extra in variants:
                knn = knnc(**base_kwargs, metric=m, **extra)
                knn.fit(X, Y)
                acc.append(knn.score(x, y))
                # Row layout: [n_neighbors, weights, algorithm, leaf_size,
                # metric] plus p when the metric is minkowski.
                param.append([*combo, m, *extra.values()])
    _results(file, acc, param)
def classification(self, metric, folds, printt=True, graph=False):
    """Benchmark a catalogue of classifiers with stratified K-fold CV.

    The catalogue depends on the data: the SVM variants are added only when
    the training set has fewer than 10000 rows, gradient boosting only when
    it has fewer than 10000 rows or fewer than 5 columns, and the logistic
    variants depend on whether the first column of ``self.y`` holds more
    than two distinct values.

    Side effects: stores the catalogue in ``self.models`` and the summary
    table in ``self.report_performance``; when ``graph`` is truthy the
    figure is appended to ``self.graphs_model`` and shown.

    Args:
        metric: Forwarded to ``cross_val_score`` as ``scoring``.
        folds: Number of cross-validation splits.
        printt: When truthy, print the summary table with a banner.
        graph: When truthy, draw a box plot of the per-fold scores.

    Returns:
        None.
    """
    fig_width = self.graph_width
    multiclass = len(self.y.iloc[:, 0].unique()) > 2

    # Significantly different set-ups of one algorithm are listed as
    # separate models.
    candidates = {
        "Linear discriminant analysis": ldac(),
        "Nearest centroid classifier euclidian": ncc(metric='euclidean'),
        "Nearest centroid classifier manhattan": ncc(metric='manhattan'),
        "K nearest neighbors classifier K2": knnc(n_neighbors=2),
        "K nearest neighbors classifier K5": knnc(n_neighbors=5),
        "K nearest neighbors classifier K10": knnc(n_neighbors=10),
        "Decision tree classifier": dtc(),
        "Gaussian naive bayes": gnbc(),
        "Bernoulli naive bayes": bnbc(binarize=0.5),
        "Multinomial naive bayes": mnbc(),
        "SGD classifier": sgdc(max_iter=10000),
        "Ridge classifier": rc(),
    }
    if len(self.Xt_train) < 10000:
        candidates.update({
            "SVM classifier RBF": svc(gamma='scale'),
            "SVM classifier Linear": svc(kernel='linear'),
            "SVM classifier Poly": svc(kernel='poly'),
        })
    if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
        candidates["Gradient boosting classifier"] = gbc()
    candidates["Random forest classifier"] = rfc(n_estimators=100)
    if multiclass:
        candidates.update({
            "Logistic classifier multinomial": logitc(multi_class='multinomial', solver='lbfgs'),
            "Logistic classifier auto": logitc(multi_class='auto'),
            "Logistic One vs Rest": ovrc(logitc()),
            "Logistic One vs One": ovoc(logitc()),
        })
    else:
        candidates["Logistic classifier"] = logitc(max_iter=2000)
    self.models = candidates

    splitter = StratifiedKFold(n_splits=folds, shuffle=True)
    fold_scores, labels, durations = [], [], []
    for label, estimator in candidates.items():
        started = time.time()
        fold_scores.append(
            cross_val_score(
                estimator,
                self.Xt_train,
                self.yt_train,
                cv=splitter,
                scoring=metric,
                error_score=np.nan,
            )
        )
        labels.append(label)
        durations.append(time.time() - started)

    summary = pd.DataFrame(
        {'Model': labels, 'Score': fold_scores, 'Elapsed Time': durations}
    )
    summary['Score (avg)'] = [scores.mean() for scores in fold_scores]
    summary['Score (std)'] = [scores.std() for scores in fold_scores]
    summary['Score (VC)'] = 100 * summary['Score (std)'] / summary['Score (avg)']
    summary = (
        summary.sort_values(by='Score (avg)', ascending=False)
        .drop('Score', axis=1)
        .reset_index(drop=True)
    )
    self.report_performance = summary

    if printt:
        banner = self.report_width * '*'
        print('\n')
        print(banner, '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        print(banner, '')
        print(summary)
        print('\n')

    if graph:
        fig, ax = plt.subplots(figsize=(fig_width, 0.5 * fig_width))
        plt.title('Classifier Comparison')
        # Boxes follow catalogue order, not the sorted order of the table.
        plt.boxplot(fold_scores)
        ax.set_xticklabels(labels)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0, bottom=0.25)
        self.graphs_model.append(fig)
        plt.show()

    return None
# Hold out 20% of the rows; everything below only touches the training split.
train, test = train_test_split(data_mod, test_size=0.2)
# Alternative split kept for reference:
# X = diabetes[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
# y = diabetes[['Outcome']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2)
print(data_mod.shape)
print(train.shape)
print(test.shape)

features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Age',
    'Insulin',
    'DiabetesPedigreeFunction',
]
target = 'Outcome'

# classifiers and classifier_names are paired by position.
classifiers = [knnc(), dtc(), SVC(gamma='auto'), SVC(kernel='linear'), gnb()]
classifier_names = [
    'K nearest neighbors',
    'Decision Tree Classifier',
    'SVM classifier with RBF kernel',
    'SVM classifier with linear kernel',
    'Gaussian Naive Bayes',
]

# 5-fold cross-validation of every candidate on the training split.
for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, train[features], train[target], cv=5)
    # The second figure is labelled "std", so report the standard deviation
    # (the variance was printed here before).
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean() * 100, 3),
          '% std: ', round(cv_scores.std() * 100, 3), '%')

# Final model: linear SVM fitted on the whole training split, with
# probability estimates enabled.
final_model_smv_lin = SVC(kernel='linear', probability=True).fit(train[features], train[target])
# final_model_gnb = gnb().fit(train[features], train[target])
] target = ['defects'] # print(df) # print(df.shape) # print(train[features].shape) # print(train[target].shape) # print(train[target].values.ravel().shape) # # print(test.shape) # Y = train[target].values.reshape(train[target].shape[0]) # print(Y) classifiers = [knnc(), dtc(), SVC(), SVC(kernel='linear'), gnb()] classifier_names = [ 'K nearest neighbors', 'Decision Tree Classifier', 'SVM classifier with RBF kernel', 'SVM classifier with linear kernel', 'Gaussian Naive Bayes' ] # for clf, clf_name in zip(classifiers, classifier_names): # cv_scores = cross_val_score(clf, train[features], train[target], cv=5) # print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), \ # '% std: ', round(cv_scores.var()*100, 3),'%') # final_model_smv_lin = SVC(kernel='linear').fit(train[features], Y) final_model_gnb = gnb().fit(train[features], train[target])