def evaluate_partitions(keep_bin_edges, df_processed):
    """Evaluate a lightweight classifier at each threshold of the target.

    For every bin edge the data is partitioned with ``partition_data`` and a
    ``ComplementNB`` classifier is trained and scored ``num_trials`` times on
    fresh splits from ``split_transform_data``.

    Args:
        keep_bin_edges: Iterable of bin edges of the continuous target; each
            edge is used as the partition threshold.
        df_processed: The processed data frame passed to ``partition_data``.

    Returns:
        A ``pandas.DataFrame`` with one row per (bin edge, trial) holding the
        data percentile, the threshold, and accuracy / MCC / ROC-AUC /
        average-precision for both the test set and the control set.
    """
    # binning parameters fixed - DO NOT CHANGE
    num_bins = 10
    num_trials = 10

    # One list per output column; insertion order is the column order.
    results = {
        'data percentile': [],   # data percentile
        'threshold': [],         # bin edge
        'test accuracy': [],     # accuracy
        'test mcc': [],          # matthews correlation coefficient
        'test auc': [],          # roc-auc
        'test ap': [],           # average precision
        'control accuracy': [],
        'control mcc': [],
        'control auc': [],
        'control ap': [],
    }

    # starting data percentile
    pct = 0.0

    # sweep through all bin edges
    for threshold in keep_bin_edges:
        # obtain the X, y matrices
        X, X_control, y = partition_data(df_processed, threshold)

        # advance the percentile by one bin per edge
        pct += 1 / num_bins

        for _ in range(num_trials):
            # get the training, testing, and control data-sets
            (x_train_idf, y_train, x_test_idf, y_test,
             x_control_idf) = split_transform_data(X, X_control, y)

            # fit the classifier on the training split
            clf = ComplementNB(
                alpha=0.1, class_prior=None, fit_prior=True, norm=False,
            ).fit(x_train_idf, y_train)

            # Hard predictions and class probabilities; column 1 of
            # predict_proba is used as the positive-class score
            # (presumably a binary target -- confirm).
            y_pred = clf.predict(x_test_idf)
            y_pred_cont = clf.predict(x_control_idf)
            y_proba = clf.predict_proba(x_test_idf)
            y_cont_proba = clf.predict_proba(x_control_idf)

            # The control set is scored against the full label vector ``y``.
            results['data percentile'].append(pct)
            results['threshold'].append(threshold)
            results['test accuracy'].append(clf.score(x_test_idf, y_test))
            results['test mcc'].append(mcc(y_test, y_pred))
            results['test auc'].append(roc_auc_score(y_test, y_proba[:, 1]))
            results['test ap'].append(apscore(y_test, y_proba[:, 1]))
            results['control accuracy'].append(clf.score(x_control_idf, y))
            results['control mcc'].append(mcc(y, y_pred_cont))
            results['control auc'].append(
                roc_auc_score(y, y_cont_proba[:, 1]))
            results['control ap'].append(apscore(y, y_cont_proba[:, 1]))

    # populate into a df for downstream analysis
    df_eval = pd.DataFrame(results)
    return df_eval


# NOTE(review): the fit call for clf_log was truncated in the source; the
# training-feature name ``train_x_vectors`` is assumed by analogy with
# ``test_x_vectors`` -- confirm against the rest of the file.
clf_log.fit(train_x_vectors, y_train)
y_pred = clf_log.predict(test_x_vectors)
clf_log.score(test_x_vectors, y_test)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

# using Naive Bayes ComplementNB
from sklearn.naive_bayes import ComplementNB

# creating the classifier
clf_compnb = ComplementNB()
# NOTE(review): ``train_x_vectors`` is assumed here as well (see above).
y_pred2 = clf_compnb.fit(train_x_vectors, y_train).predict(test_x_vectors)
confusion_matrix(y_test, y_pred2)
clf_compnb.score(test_x_vectors, y_test)
# Hold out 20% of the data, stratified on genre.
X_train, X_test, y_train, y_test = train_test_split(
    word_vec,
    lyrics_sub['genre'],
    test_size=0.20,
    stratify=lyrics_sub['genre'],
)

print("At CNB")
# Create Model
clf = ComplementNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print("CNB Results")
# Score Model
print(clf.score(X_test, y_test))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

print("At XGB")
# Create Model
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

print("XGB RESULTS")
# Score Model
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
# clf = RandomForestClassifier(verbose=0, random_state=42, n_estimators=100)
# NOTE(review): the fit call was truncated in the source (only ``, y)``
# survived); the training-feature name ``train_x`` is assumed by analogy with
# ``test_x`` -- confirm against the rest of the file.
clf.fit(train_x, y)

# Label each test document 1 when ``uid`` appears in its solution entry,
# otherwise 0.
test_y = np.array([1 if uid in solution[doc] else 0 for doc in test_freqs])

predictions = clf.predict(test_x)
clf.score(test_x, test_y)

# Tally the four confusion-matrix cells by hand.
true_pos = 0
false_pos = 0
false_neg = 0
true_neg = 0
for predicted, actual in zip(predictions, test_y):
    if predicted == 1 and actual == 1:
        true_pos += 1
    if predicted == 0 and actual == 1:
        false_neg += 1
    if predicted == 1 and actual == 0:
        false_pos += 1
    if predicted == 0 and actual == 0:
        true_neg += 1
# keep the knn, it's the best
# NOTE(review): both fit calls were truncated in the source; the
# training-feature name ``train_data_features`` is assumed by analogy with
# ``test_data_features`` -- confirm against the rest of the file.
knn = KNeighborsClassifier()
knn.fit(train_data_features, y_train)
knn_preds = knn.predict(test_data_features)
dump(knn, 'knn.joblib')

cnb = ComplementNB()
cnb.fit(train_data_features, y_train)
cnb_preds = cnb.predict(test_data_features)

# make df with all preds
df = pd.DataFrame(
    list(zip(cnb_preds, lr_preds, knn_preds, y_test, x_test)),
    columns=['cnb_preds', 'lr_preds', 'knn_preds', 'category', 'document'],
)

# save incorrect predictions in a df to look at
lr_incorrect = df[df['lr_preds'] != df['category']].copy()
knn_incorrect = df[df['knn_preds'] != df['category']].copy()
cnb_incorrect = df[df['cnb_preds'] != df['category']].copy()

# rows that both knn and lr got wrong
two_incorrect = knn_incorrect[
    knn_incorrect['lr_preds'] != knn_incorrect['category']
].copy()
# rows that all three models got wrong
all_incorrect = two_incorrect[
    two_incorrect['cnb_preds'] != two_incorrect['category']
].copy()

print('knn score: ', knn.score(test_data_features, y_test))
print('log_reg score: ', log_reg.score(test_data_features, y_test))
print('ComplementNaiveBayes score: ', cnb.score(test_data_features, y_test))
# The last column is the target; every other column is a feature.
Y = numpy.asarray(data[data.columns[-1]])
X = numpy.asarray(data[data.columns[0:-1]])

clf = tree.DecisionTreeClassifier(max_depth=4)
GNB = GaussianNB()
MNB = MultinomialNB()
CNB = ComplementNB()

# For each model: print its label, its 5-fold cross-validation scores, then
# fit on the full data and print the score on that same data.
for label, model in (('clf', clf), ('GNB', GNB), ('MNB', MNB), ('CNB', CNB)):
    print(label)
    scores = cross_val_score(model, X, Y, cv=5)
    print(scores)
    model.fit(X, Y)
    print(model.score(X, Y))
# Model Accuracy, how often is the classifier correct?

#%% Naive Bayes
# NOTE(review): every fit call in this section was truncated in the source
# (only ``,y_train)`` survived); the training-feature name ``x_train`` is
# assumed by analogy with ``x_test`` -- confirm against the rest of the file.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)

from sklearn.naive_bayes import MultinomialNB
clf1 = MultinomialNB()
clf1.fit(x_train, y_train)

from sklearn.naive_bayes import ComplementNB
clf2 = ComplementNB()
clf2.fit(x_train, y_train)

print("\n","GaussianNB:",nb.score(x_test,y_test),"\n","MultinomialNB:",clf1.score(x_test,y_test),"\n","ComplementNB:",clf2.score(x_test,y_test))

# GaussianNB was chosen as the most suitable because its accuracy is the
# highest.
predictionnb = nb.predict(x_test)
y_prednb = nb.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_prednb))
print(confusion_matrix(y_test, y_prednb))
print("GaussianNB")
print(classification_report(y_test, y_prednb))

#%% Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=None,
    min_samples_split=10,
    max_features=18,
    random_state=0,
)
dt = dt.fit(x_train, y_train)
predictiondt = dt.predict(x_test)
y_preddt = dt.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_preddt))