def run_model(training, testing, fields, labels):
    """Train a CategoricalNB on one split and report its error breakdown.

    ``training`` and ``testing`` are iterables of positional indices used both
    with ``fields.iloc[...]`` and to index ``labels``.

    Returns ``(res, acc, fp, fn)`` where ``res`` is the list of predictions
    for the test rows and the three numbers are fractions of the whole test
    split: ``fp`` counts predictions of 1 whose label is 0, ``fn`` counts
    predictions of 0 whose label is 1, and ``acc`` counts everything else.
    """
    train_fields = fields.iloc[training].reset_index(drop=True)
    train_labels = [labels[idx] for idx in training]
    test_fields = fields.iloc[testing].reset_index(drop=True)
    test_labels = [labels[idx] for idx in testing]

    model = CategoricalNB()
    model.fit(train_fields, train_labels)
    res = model.predict(test_fields).tolist()

    pairs = list(zip(res, test_labels))
    false_pos = sum(1 for pred, truth in pairs if pred == 1 and truth == 0)
    false_neg = sum(1 for pred, truth in pairs if pred == 0 and truth == 1)
    total = len(res)

    fp = false_pos / total
    fn = false_neg / total
    # Every sample that is neither a false positive nor a false negative.
    acc = (total - false_pos - false_neg) / total

    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn
def run_kfold(fields, labels):
    """Run 5-fold cross-validation and return the best-scoring split.

    For every fold a fresh CategoricalNB is fitted on the training rows and
    scored on the held-out rows; each fold's accuracy is printed.

    Returns the ``(train_index, test_index)`` pair of the fold with the
    highest accuracy, or ``([], [])`` if no fold scored above zero.
    """
    splitter = KFold(n_splits=5)
    best = [], []
    best_accuracy = 0
    # train_index and test_index index into fields and labels
    for train_index, test_index in splitter.split(fields):
        train_fields = fields.iloc[train_index].reset_index(drop=True)
        train_labels = [labels[idx] for idx in train_index]
        test_fields = fields.iloc[test_index].reset_index(drop=True)
        test_labels = [labels[idx] for idx in test_index]

        model = CategoricalNB()
        model.fit(train_fields, train_labels)
        try:
            res = model.predict(test_fields).tolist()
        except IndexError:
            # Folds whose prediction fails are skipped on purpose.
            # NOTE(review): presumably raised when the held-out rows contain
            # category codes larger than any seen in training - confirm.
            continue

        hits = sum(1 for pred, truth in zip(res, test_labels) if pred == truth)
        acc = hits / len(res)
        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc
        print("accuracy rate: ", acc)
    return best
def _cnb_pipeline_accuracy(nb, train_data, train_targets, test_data, test_classes):
    """Fit an imputer + ``nb`` pipeline on the training data; return test accuracy."""
    imputer = SimpleImputer(missing_values=-2, strategy=MISSING_STRATEGY)
    classifier = make_pipeline(imputer, nb)
    classifier.fit(train_data, train_targets)
    predictions = classifier.predict(test_data)
    num_correct = sum(
        (1.0 for i, predicted in enumerate(predictions)
         if predicted == test_classes.iat[i, 0]),
        0.0,
    )
    return num_correct / len(test_classes)


def cnb(train_data, train_classes, test_data, test_classes):
    """Evaluate CategoricalNB behind a SimpleImputer and scan integer alphas.

    Values equal to ``-2`` are treated as missing and imputed with the
    module-level ``MISSING_STRATEGY``. The function first prints the test
    accuracy of a default ``CategoricalNB()``, then tries ``alpha`` = 1..99 and
    prints the first alpha that reaches the highest test accuracy.

    ``train_classes`` and ``test_classes`` are accessed as single-column
    DataFrames (``.values.ravel()`` / ``.iat[i, 0]``). Nothing is returned.

    The imputer is only ever fitted on the training data (inside the
    pipeline). The previous standalone ``transformer.fit(test_data, ...)`` was
    removed: the pipeline refits the imputer anyway, so the call had no effect
    on the results and only looked like test-set leakage.
    """
    train_targets = train_classes.values.ravel()

    # NOTE(review): CategoricalNB() uses its default alpha here, so despite the
    # wording of the message this baseline is not literally unsmoothed - confirm
    # whether an explicit small alpha was intended.
    accuracy = _cnb_pipeline_accuracy(
        CategoricalNB(), train_data, train_targets, test_data, test_classes)
    print("CategoricalNB classifier with no smoothing has accuracy " + str(accuracy))

    best_accuracy = -1
    best_laplace = -1
    for laplace in range(1, 100):
        accuracy = _cnb_pipeline_accuracy(
            CategoricalNB(alpha=laplace), train_data, train_targets,
            test_data, test_classes)
        # Strict comparison keeps the smallest alpha among ties.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_laplace = laplace
    print("Best classifier with smoothing has alpha = " + str(best_laplace))
    print("And accuracy " + str(best_accuracy))
def categoricalNaiveBayes(dtrain, dtest):
    """Fit CategoricalNB on min-max scaled training data and predict ``dtest``.

    The last column of ``dtrain`` is the target; the remaining columns are the
    features. Fitted attributes of the model are printed for inspection.

    Returns the predictions for the (rescaled) ``dtest`` rows.
    """
    # can use split?
    y_train = dtrain[:, -1]
    x_train = preprocessing.MinMaxScaler().fit_transform(dtrain[:, :-1])

    # NOTE(review): the test matrix is rescaled with its own min/max rather
    # than with the scaler fitted on the training features - confirm intended.
    dtest = preprocessing.MinMaxScaler().fit_transform(dtest)

    # NOTE(review): the scaled inputs are floats in [0, 1]; verify that this is
    # what CategoricalNB should be given as category codes.
    gnb = CategoricalNB()
    gnb.fit(x_train, y_train)

    print("GNB Features")
    # Attributes are looked up one at a time, right before they are printed.
    report = (
        ("GNB cat count Features", "category_count_"),
        ("GNB class count Features", "class_count_"),
        ("GNB feature log prob Features", "feature_log_prob_"),
        ("GNB n Features", "n_features_"),
    )
    for heading, attribute in report:
        print(heading)
        print(getattr(gnb, attribute))
    print("Length test")
    print(len(dtest[0]))

    return gnb.predict(dtest)
def test_alpha():
    """Check alpha handling of the discrete naive Bayes estimators.

    ``alpha=0`` must warn and still yield finite probabilities (dense and
    sparse input); a negative alpha must raise ``ValueError`` with a fixed
    message from both ``fit`` and ``partial_fit``.
    """
    # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    bernoulli_prob = np.array([[1, 0], [0, 1]])
    multinomial_prob = np.array([[2. / 3, 1. / 3], [0, 1]])
    incremental_cases = (
        (BernoulliNB, bernoulli_prob),
        (MultinomialNB, multinomial_prob),
    )

    for estimator_cls, expected in incremental_cases:
        nb = estimator_cls(alpha=0.)
        assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
        assert_warns(UserWarning, nb.fit, X, y)
        assert_array_almost_equal(nb.predict_proba(X), expected)

    nb = CategoricalNB(alpha=0.)
    assert_warns(UserWarning, nb.fit, X, y)
    assert_array_almost_equal(nb.predict_proba(X),
                              np.array([[1., 0.], [0., 1.]]))

    # Test sparse X
    X_sparse = scipy.sparse.csr_matrix(X)
    for estimator_cls, expected in incremental_cases:
        nb = estimator_cls(alpha=0.)
        assert_warns(UserWarning, nb.fit, X_sparse, y)
        assert_array_almost_equal(nb.predict_proba(X_sparse), expected)

    # Test for alpha < 0
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    for estimator_cls in (BernoulliNB, MultinomialNB, CategoricalNB):
        assert_raise_message(ValueError, expected_msg,
                             estimator_cls(alpha=-0.1).fit, X, y)
    for estimator_cls in (BernoulliNB, MultinomialNB):
        assert_raise_message(ValueError, expected_msg,
                             estimator_cls(alpha=-0.1).partial_fit,
                             X, y, classes=[0, 1])
def test_categoricalnb_min_categories_errors(min_categories, error_msg):
    """Fitting with the given ``min_categories`` must raise a matching ValueError."""
    features = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    targets = np.array([1, 1, 2, 2])

    estimator = CategoricalNB(alpha=1, fit_prior=False,
                              min_categories=min_categories)
    with pytest.raises(ValueError, match=error_msg):
        estimator.fit(features, targets)
def setUp(self):
    """Create a seeded random integer matrix and a CategoricalNB fitted on it.

    Stores the (6, 100) feature matrix as ``self.X`` and the fitted estimator
    as ``self.model``; the six rows are labelled 1 through 6.
    """
    generator = np.random.RandomState(1)
    self.X = generator.randint(5, size=(6, 100))
    targets = np.array([1, 2, 3, 4, 5, 6])

    classifier = CategoricalNB()
    classifier.fit(self.X, targets)
    self.model = classifier
def fit(self, x, y, **fit_params):
    """Fit a fresh CategoricalNB incrementally, one row batch at a time.

    ``x`` is sliced into consecutive chunks of ``self._batch_size`` rows; each
    chunk is densified with ``.toarray()`` and passed to ``partial_fit``
    together with the matching slice of ``y``. ``fit_params`` is accepted but
    not used. Returns ``self``.
    """
    step = self._batch_size
    self._model = CategoricalNB(*self._args, **self._kwargs)
    for start in range(0, x.shape[0], step):
        stop = start + step
        # The final chunk may be shorter; slicing past the end is harmless.
        self._model.partial_fit(
            x[start:stop, :].toarray(),
            y[start:stop],
            classes=self._classes,
        )
    return self
def check_sklearn_dev():
    """Sanity-check the installed sklearn by fitting a tiny CategoricalNB.

    A model is trained on a seeded random matrix whose six rows carry the
    labels 1..6; the third row must be predicted as class 3, otherwise the
    assertion fails.
    """
    generator = np.random.RandomState(1)
    samples = generator.randint(5, size=(6, 100))
    targets = np.array([1, 2, 3, 4, 5, 6])

    classifier = CategoricalNB()
    classifier.fit(samples, targets)

    third_row = samples[2:3]
    assert [3] == classifier.predict(third_row)
def fit(self, X, Y):
    """
    Fit one CategoricalNB per label column.

    Each column of Y is treated as an independent target; the fitted
    estimators are appended to ``self.predictors`` in column order.

    Arguments:
        X (np.array): training data matrix of shape (n_samples, n_features)
        Y (np.array): label matrix of shape (n_samples, n_labels)
    """
    for column in range(Y.shape[1]):
        column_model = CategoricalNB()
        column_model.fit(X, Y[:, column])
        self.predictors.append(column_model)
def cnb(train_x, train_y, test_x, test_y):
    """Train a CategoricalNB, print its metrics and plot its confusion matrix.

    Prints the root-mean-squared error and the macro-averaged F1 score of the
    predictions on ``test_x``, then draws the row-normalised confusion matrix
    as a heatmap, saves it to ``./output/CompNB.png`` and shows it.

    Two reporting defects are fixed here: the value labelled "RMSE" used to be
    the plain mean squared error (its square root is now taken), and the
    message named the "Complement" model although a CategoricalNB is fitted.
    """
    model = CategoricalNB()
    model.fit(train_x, train_y)
    y_predictions = model.predict(test_x)

    # mean_squared_error yields the MSE; the square root gives the RMSE.
    rmse = mean_squared_error(test_y, y_predictions) ** 0.5
    print("RMSE for Categorical Naive Bayes model = ", rmse)

    my_f1 = f1_score(test_y, y_predictions, average='macro')
    print("f1_macro for Categorical Naive Bayes Classifier = ", my_f1)

    cm = confusion_matrix(test_y, y_predictions, normalize='true')
    sns.heatmap(cm, annot=True)
    plt.title('Confusion matrix of the Categorical Naive Bayes classifier')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    # File name kept as-is so existing consumers of the image still find it.
    plt.savefig('./output/CompNB.png')
    plt.show()
def test_categoricalnb_with_min_categories(
    min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_
):
    """Check counts, prediction and ``n_categories_`` for a ``min_categories`` value."""
    train_X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    train_y = np.array([1, 1, 2, 2])

    clf = CategoricalNB(alpha=1, fit_prior=False,
                        min_categories=min_categories)
    clf.fit(train_X, train_y)

    # One count array per feature.
    first_feature_count, second_feature_count = clf.category_count_
    assert_array_equal(first_feature_count, exp_X1_count)
    assert_array_equal(second_feature_count, exp_X2_count)

    # new_X is expected to be classified as class 1.
    assert_array_equal(clf.predict(new_X), np.array([1]))
    assert_array_equal(clf.n_categories_, exp_n_categories_)
def NaiveBayes(xtrain, ytrain, xtest, ytest, binary=False):
    """Fit a naive Bayes model and report how it performs on the test set.

    Uses ``GaussianNB`` when ``binary`` is true and ``CategoricalNB``
    otherwise. Prints the model name, the number of misclassified test
    samples and the accuracy, then delegates to ``heatmap_confmat`` (with the
    file name ``"naive_bayes.png"``) and ``feature_importance_NB``.

    The test set is now predicted once; the earlier version called
    ``predict`` twice and computed class probabilities that were never used.
    """
    if binary:
        nb = GaussianNB()
        model = "GaussianNB"
    else:
        nb = CategoricalNB()
        model = "CategoricalNB"

    nb.fit(xtrain, ytrain)
    y_pred_nb = nb.predict(xtest)

    # how did our model perform?
    count_misclassified = (ytest != y_pred_nb).sum()
    print(model)
    print("=" * 30)
    print('Misclassified samples: {}'.format(count_misclassified))
    accuracy = accuracy_score(ytest, y_pred_nb)
    print('Accuracy: {:.5f}'.format(accuracy))

    heatmap_confmat(ytest, y_pred_nb, "naive_bayes.png")
    feature_importance_NB(nb, xtest, ytest)
    print("Naive Bayes done")
def find_best_feature(selected_features, X_train, C_param, y_train, feature):
    """Score one candidate feature added to the current selection.

    The columns ``selected_features + [feature]`` of ``X_train`` are passed
    through ``bin_rank`` and a CategoricalNB is fitted on the result. The
    score is the accuracy on that same training data.

    ``C_param`` is not used by this implementation.

    Returns a tuple ``(feature, training accuracy)``.
    """
    candidate_columns = np.append(selected_features, feature)
    ranked = bin_rank(X_train[:, candidate_columns])

    nb_model = CategoricalNB(
        alpha=1.0, fit_prior=True, class_prior=None, min_categories=5
    ).fit(ranked, y_train)

    return (feature, nb_model.score(ranked, y_train))
def buildReturnModel(self):
    """Instantiate the naive Bayes estimator selected by ``self.flavor``.

    Raises ``ValueError`` when no flavor is set or when ``self.exp_type`` is
    not ``'classification'``. The estimator is built with
    ``self.default_args`` as keyword arguments; a flavor that matches none of
    the known names yields ``None``.
    """
    if not self.flavor:
        raise ValueError(
            'cannot build model because the flavor of Naive Bayes is unknown!'
        )
    if not self.exp_type == 'classification':
        raise ValueError(
            'Naive bayes can only be used for classification problems!'
        )

    if self.flavor == 'Bernoulli':
        return BernoulliNB(**self.default_args)
    if self.flavor == 'Categorical':
        return CategoricalNB(**self.default_args)
    if self.flavor == 'Complement':
        return ComplementNB(**self.default_args)
    if self.flavor == 'Gaussian':
        return GaussianNB(**self.default_args)
    if self.flavor == 'Multinomial':
        return MultinomialNB(**self.default_args)
    # Unrecognised flavor: no model is built.
    return None
def bayes(test_set, training_set, categories):
    """Train a CategoricalNB on ``training_set`` and report on ``test_set``.

    Both sets are converted to feature/label pairs with ``build_xy``. The
    model's accuracy score and a classification report for the test set are
    printed; nothing is returned.

    Four confusion counters that were initialised but never updated or read
    have been removed.
    """
    classifier = CategoricalNB()
    train_x, train_y = build_xy(training_set, categories)
    classifier.fit(train_x, train_y)

    test_x, test_y = build_xy(test_set, categories)
    y_predicted = classifier.predict(test_x)

    print(f'score: {classifier.score(test_x, test_y)}')
    print('bayes confusion matrix')
    print(classification_report(test_y, y_predicted))
def selected_feature_check(data, X_train, y_train, selected_features,
                           selected_features_by_name, C_param, best_feature):
    """Add ``best_feature`` to the selection, then drop the least useful one.

    After adding the candidate, every feature in the enlarged selection is
    removed in turn; a CategoricalNB is fitted on the ``bin_rank``-ed
    remaining columns and scored on that same training data. The feature whose
    removal gives the highest score is dropped from the selection.

    ``selected_features_by_name`` is appended to in place before a new name
    list is built. ``C_param`` is not used by this implementation.

    Returns ``(dropped_feature, selected_features, selected_features_by_name,
    unchanged)``; ``unchanged`` is True when the resulting selection equals
    the one passed in, in which case a notice is printed as well.
    """
    # Snapshot of the incoming selection, compared with the outcome below.
    prev_selected_features = selected_features.copy()

    # Add the best feature to both representations of the selection.
    selected_features = np.append(selected_features, best_feature)
    selected_features_by_name.append(data.columns[best_feature])

    # Training accuracy obtained when each feature is left out in turn.
    feature_removal_score = {}
    for feature in selected_features:
        remaining = np.setdiff1d(selected_features, np.array([feature]))
        ranked = bin_rank(X_train[:, remaining])
        nb_model = CategoricalNB(
            alpha=1.0, fit_prior=True, class_prior=None, min_categories=5
        ).fit(ranked, y_train)
        feature_removal_score[feature] = nb_model.score(ranked, y_train)

    # The feature we can best do without is the one with the highest score.
    max_key = max(feature_removal_score, key=feature_removal_score.get)
    selected_features = np.setdiff1d(selected_features, np.array([max_key]))
    max_key_name = data.columns[max_key]
    selected_features_by_name = list(
        set(selected_features_by_name) - {max_key_name})

    # Tell the caller whether this step changed the selection at all.
    unchanged = np.array_equal(selected_features, prev_selected_features)
    if unchanged:
        print(
            "-----------------------------------------------------------------------------------------------"
        )
        print(
            "We have found the unchanged set, thus from now on we are just adding to our selected features.\n"
        )
        print(
            "-----------------------------------------------------------------------------------------------"
        )
    return max_key, selected_features, selected_features_by_name, unchanged
def GetLearningAutomata(typeOfAutomata):
    """Return a new, default-configured estimator for the given type name.

    Recognised names are "Categorical", "Gaussian", "DecisionTree" and
    "LinearSVC"; any other value yields ``None``.
    """
    if typeOfAutomata == "Categorical":
        return CategoricalNB()
    if typeOfAutomata == "Gaussian":
        return GaussianNB()
    if typeOfAutomata == "DecisionTree":
        return DecisionTreeClassifier()
    if typeOfAutomata == "LinearSVC":
        return LinearSVC()
    return None
def naive_bayes_fit_and_predict(X_train, X_test, Y_train, Y_test):
    """Fit five naive Bayes variants and predict ``X_test`` with each.

    Returns a 5-tuple of predictions in the order Gaussian, Multinomial,
    Complement, Bernoulli, Categorical. ``Y_test`` is accepted but not used.
    """
    # All estimators are created first, then fitted one after another.
    estimators = (
        GaussianNB(),
        MultinomialNB(),
        ComplementNB(),
        BernoulliNB(),
        CategoricalNB(),
    )
    return tuple(
        estimator.fit(X_train, Y_train).predict(X_test)
        for estimator in estimators
    )
def test_incremental_validation(X=None, y=None, iterations=10, verbose=1):
    """Time several leave-one-out validation strategies and compare scores.

    When ``X`` is None a synthetic classification problem is generated and
    floor-divided by 10. In every iteration three strategies are timed: the
    classifier's own ``leave_one_out_cross_val``, ``cross_leave_one_out`` on
    raw data, and ``cross_leave_one_out`` on pre-encoded data (encoding time
    included). All scores must agree with the first iteration's score. The
    mean timings, skipping the first iteration, are printed at the end.

    Fix: the default check is ``X is None`` instead of ``not X``, because
    truth-testing a multi-element NumPy array raises ``ValueError``. Locals
    that were never read and a timer assignment that was immediately
    overwritten have been dropped.
    """
    if X is None:
        X, y = make_classification(n_samples=500,
                                   n_features=1000,
                                   n_informative=20,
                                   n_redundant=1,
                                   n_repeated=0,
                                   n_classes=2,
                                   n_clusters_per_class=2,
                                   weights=None,
                                   class_sep=1,
                                   hypercube=False,
                                   scale=1.0,
                                   shuffle=True,
                                   random_state=0)
        X //= 10  # --> To be able to evaluate categoricalNB

    # classifiers
    nb_classifier = NaiveBayes(encode_data=True)
    nb_classifier_no_encoding = NaiveBayes(encode_data=False)
    custom_encoder = CustomOrdinalFeatureEncoder()

    # accumulators
    # NOTE(review): nothing is ever appended to categorical_nb, so the first
    # summary line below reports the mean of an empty list - confirm whether a
    # CategoricalNB timing was meant to be collected here.
    categorical_nb = []
    custom_nb_val_1 = []
    custom_nb_val_3 = []
    custom_nb_val_4 = []

    for i in range(iterations):
        if verbose:
            print(f"Iteration {i}")

        # Untimed encoder pass before the measurements start.
        X2 = custom_encoder.fit_transform(X)

        ts = time()
        score_2 = nb_classifier.leave_one_out_cross_val(X, y)
        custom_nb_val_1.append(time() - ts)

        ts = time()
        score_4 = cross_leave_one_out(nb_classifier, X, y)
        custom_nb_val_3.append(time() - ts)

        # The encoding step is deliberately part of this measurement.
        ts = time()
        X2 = custom_encoder.fit_transform(X)
        score_5 = cross_leave_one_out(nb_classifier_no_encoding, X2, y)
        custom_nb_val_4.append(time() - ts)

        # The first iteration's score is the reference for all later ones.
        if i == 0:
            score_1 = score_2
        scores = [score_1, score_2, score_4, score_5]
        assert all(score == scores[0] for score in scores)

    print("Categorical with scikit loo: ", np.mean(categorical_nb[1:]))
    print("Custom with scikit loo: ", np.mean(custom_nb_val_3[1:]))
    print("Custom with scikit loo (pre-encoding): ", np.mean(custom_nb_val_4[1:]))
    print("Custom with first incremental: ", np.mean(custom_nb_val_1[1:]))
class CategoricalBatchNB(TransformerMixin):
    """CategoricalNB wrapper that fits and predicts in fixed-size row batches.

    Each batch is densified with ``.toarray()`` before it is handed to the
    wrapped estimator. Extra positional and keyword arguments are forwarded
    to ``CategoricalNB``.
    """

    def __init__(self, batch_size, classes, *args, **kwargs):
        self._batch_size = batch_size
        self._classes = classes
        self._args = args
        self._kwargs = kwargs
        self._model = CategoricalNB(*args, **kwargs)

    def fit(self, x, y, **fit_params):
        """Rebuild the wrapped model and train it batch by batch; return self."""
        step = self._batch_size
        self._model = CategoricalNB(*self._args, **self._kwargs)
        for start in tqdm(range(0, x.shape[0], step)):
            stop = start + step
            self._model.partial_fit(
                x[start:stop, :].toarray(),
                y[start:stop],
                classes=self._classes
            )
        return self

    @staticmethod
    def transform(x, y=None, **fit_params):
        """Return ``x`` unchanged."""
        return x

    def predict(self, x):
        """Predict all rows of ``x`` batch by batch; return a flat array."""
        step = self._batch_size
        collected = []
        for start in tqdm(range(0, x.shape[0], step)):
            dense_batch = x[start:start + step, :].toarray()
            collected.extend(self._model.predict(dense_batch).tolist())
        return np.array(collected).ravel()

    def score(self, x, y):
        """Return the accuracy of the batched predictions against ``y``."""
        return accuracy_score(y, self.predict(x))

    def __str__(self):
        return "CategoricalBatchNB()"

    def __repr__(self):
        return self.__str__()
def test_predict(self, model, dummy_cat_X, dummy_cat_y):
    """``model`` must predict the same labels as a near-unsmoothed CategoricalNB."""
    # reduce alpha to ensure no smoothing
    reference = CategoricalNB(alpha=1.0e-10).fit(dummy_cat_X, dummy_cat_y)
    expected = reference.predict(dummy_cat_X)

    model.fit(dummy_cat_X, dummy_cat_y)
    actual = model.predict(dummy_cat_X)

    np.testing.assert_array_equal(expected, actual)
def test_naive_bayes():
    """Log the predictions of MyCategoricalNB and CategoricalNB for one sample.

    Both models are fitted with ``alpha=0`` on the data returned by
    ``load_simple_data`` and queried with the single row ``[0, 1, 0]``.
    """
    def query():
        # A fresh array for every call, exactly one sample.
        return np.array([[0, 1, 0]])

    x, y = load_simple_data()

    logging.info("My Bayes 运行结果:")
    own_model = MyCategoricalNB(alpha=0)
    own_model.fit(x, y)
    logging.info(own_model.predict(query(), with_prob=True))

    logging.info("CategoricalNB 运行结果:")
    reference_model = CategoricalNB(alpha=0)
    reference_model.fit(x, y)
    logging.info(reference_model.predict(query()))
    logging.info(reference_model.predict_proba(query()))
def pengujian():
    """Render the evaluation page: train on 2005-2019 rows, test on 2020 rows.

    Redirects to ``index`` unless ``"admin"`` is present in the session.
    Columns 0-4 of each ``preprocessing`` row are the features and column 5 is
    the label. A CategoricalNB is fitted on the training rows; the confusion
    matrix and the accuracy (as a rounded percentage) on the test rows are
    passed to ``pengujian.html`` as ``hasil`` and ``akurasi``.

    Changes:
    - the cursor is closed even when a query fails;
    - the ``payload`` list was removed: it was never passed to the template,
      and it indexed the *training* labels with test-row positions, which
      raises ``IndexError`` whenever the test set is larger than the
      training set;
    - accuracy is computed as trace/sum of the confusion matrix, which equals
      the old formula for a 2x2 matrix and no longer assumes that size.
    """
    if "admin" not in session:
        return redirect(url_for("index"))

    mydb.connect()
    cursor = mydb.cursor()
    try:
        cursor.execute(
            "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019)'"
        )
        training = cursor.fetchall()
        cursor.execute(
            "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2020)'"
        )
        testing = cursor.fetchall()
    finally:
        cursor.close()

    X = [[row[0], row[1], row[2], row[3], row[4]] for row in training]
    y = [row[5] for row in training]
    X_test = [[row[0], row[1], row[2], row[3], row[4]] for row in testing]
    y_test = [row[5] for row in testing]

    clf = CategoricalNB()
    clf.fit(X, y)
    predicted = clf.predict(X_test)

    hasil = confusion_matrix(y_test, predicted)
    # Correct predictions sit on the diagonal of the confusion matrix.
    akurasi = hasil.trace() / hasil.sum()
    return render_template("pengujian.html",
                           hasil=hasil,
                           akurasi=round(akurasi * 100))
def test_predict_meta_override():
    """``predict_meta`` must let ParallelPostFit predict with a value-dependent model."""
    features = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    targets = np.array([1, 2, 3, 4])

    base = CategoricalNB()
    base.fit(pd.DataFrame(features), targets)

    lazy_features = dd.from_pandas(features, npartitions=2)
    # Replace the meta frame with a value the fitted model has not seen.
    lazy_features._meta = pd.DataFrame({"c_0": [5]})

    # Failure when not providing predict_meta
    # because of value dependent model
    with pytest.raises(ValueError):
        ParallelPostFit(base).predict(lazy_features)

    # Success when providing meta over-ride
    wrapped = ParallelPostFit(base, predict_meta=np.array([1]))
    assert_eq_ar(wrapped.predict(lazy_features), base.predict(features))
def example_weather_nominal():
    """Compare GaussianNB and CategoricalNB on the nominal weather data set.

    Reads ``weather-nominal.csv`` from ``base_path``, label-encodes every
    feature column, fits both models and prints their prediction and class
    probabilities for the encoded sample ``[2, 0, 0, 1]``.
    """
    csv_path = (base_path / "weather-nominal.csv").resolve()
    series = pd.read_csv(csv_path)

    # arrange table in X(features) and y(target)
    X = series.iloc[:, :-1].apply(LabelEncoder().fit_transform)
    y = series.iloc[:, -1]

    # apply GaussianNB and CategoricalNB
    gNB = GaussianNB()
    gNB.fit(X, y)
    cNB = CategoricalNB()
    cNB.fit(X, y)

    sample = [[2, 0, 0, 1]]
    print(
        f"Prediction GaussianNB ([Sunny,Cool,High,True]]): {gNB.predict(sample)}"
    )
    print(f"Probability GaussianNB: {gNB.predict_proba(sample)}")
    print("\n")
    print(
        f"Prediction CategoricalNB ([Sunny,Cool,High,True]]): {cNB.predict(sample)}"
    )
    print(f"Probability CategoricalNB: {cNB.predict_proba(sample)}")
def example_weather_numeric():
    """Compare GaussianNB and CategoricalNB on the numeric weather data set.

    Reads ``weather-numeric.csv`` from ``base_path``, label-encodes the
    ``outlook`` and ``windy`` columns, fits both models and prints their
    prediction and class probabilities for the sample ``[2, 66, 90, 1]``.

    The feature frame is now an explicit copy of the slice taken from the CSV
    data, so assigning the encoded columns no longer writes to a slice of
    ``series`` (which pandas reports as ``SettingWithCopyWarning``).
    """
    path = (base_path / "weather-numeric.csv").resolve()
    series = pd.read_csv(path)

    # arrange table in X(features) and y(target)
    X = series.iloc[:, :-1].copy()
    X["outlook"] = LabelEncoder().fit_transform(X["outlook"])
    X["windy"] = LabelEncoder().fit_transform(X["windy"])
    y = series.iloc[:, -1]

    # apply GaussianNB and CategoricalNB
    gNB = GaussianNB()
    gNB.fit(X, y)
    cNB = CategoricalNB()
    cNB.fit(X, y)

    print(
        f"Prediction GaussianNB ([Sunny,66,90,True]]]): {gNB.predict([[2, 66, 90, 1]])}"
    )
    print(f"Probability GaussianNB: {gNB.predict_proba([[2, 66, 90, 1]])}")
    print("\n")
    print(
        f"Prediction CategoricalNB ([Sunny,66,90,True]]): {cNB.predict([[2, 66, 90, 1]])}"
    )
    print(f"Probability CategoricalNB: {cNB.predict_proba([[2, 66, 90, 1]])}")
def test_spam_classification():
    """Log classification reports of MyCategoricalNB and CategoricalNB.

    Both models are trained and evaluated on the split returned by
    ``load_data``; the custom model uses ``alpha=1.0`` and the sklearn model
    its defaults.
    """
    x_train, x_test, y_train, y_test = load_data()

    own_model = MyCategoricalNB(alpha=1.0)
    own_model.fit(x_train, y_train)
    own_predictions = own_model.predict(x_test)
    logging.info("My Bayes 运行结果:")
    logging.info(classification_report(y_test, own_predictions))

    reference_model = CategoricalNB()
    reference_model.fit(x_train, y_train)
    reference_predictions = reference_model.predict(x_test)
    logging.info("CategoricalNB 运行结果:")
    logging.info(classification_report(y_test, reference_predictions))
def classificationCategoricalNaiveBayes():
    """Train, persist and evaluate a CategoricalNB on ``data.csv``.

    The CSV is read with fixed column names; ``cosine``, ``len``, ``word`` and
    ``sameDomain`` are the features and ``label`` is the target. The data is
    split 80/20 with a fixed seed, the fitted model is written to
    ``model.pkl``, and prediction time, accuracy, precision, recall and the
    confusion matrix for the test split are printed.

    Fixes:
    - ``model.pkl`` is written inside a ``with`` block so the file handle is
      closed (it was left open before);
    - the value printed as "precision" now comes from ``precision_score``.
      ``average_precision_score`` summarises a precision-recall curve built
      from scores and is not the precision of hard predictions.
    """
    col_names = [
        '*', 'web1', 'web2', 'cosine', 'len', 'word', 'sameDomain', 'label'
    ]
    # load dataset
    pima = pd.read_csv("data.csv", names=col_names)

    # split dataset in features and target variable
    feature_cols = ['cosine', 'len', 'word', 'sameDomain']
    X = pima[feature_cols]  # Features
    y = pima.label  # Target variable

    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=4)  # 80% training and 20% test

    clf = CategoricalNB()
    clf.fit(X_train, y_train)

    # save the model
    with open('model.pkl', 'wb') as model_file:
        dump(clf, model_file)

    # Predict the response for test dataset, timing only the prediction
    startTime = datetime.now()
    y_pred = clf.predict(X_test)
    endTime = datetime.now()
    print("exec time :", endTime - startTime)

    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("precision:", metrics.precision_score(y_test, y_pred))
    print("recall:", metrics.recall_score(y_test, y_pred))
    print()
    print(confusion_matrix(y_test, y_pred))
def setupClf(method, param):
    """Build the classifier named by ``method``, configured from ``param``.

    - ``'knn'``: ``param`` is the number of neighbours.
    - ``'bayes'``: ``param`` is the CategoricalNB smoothing ``alpha``.
    - ``'forest'``: ``param[0..3]`` are ``n_estimators``, ``max_depth``,
      ``min_samples_leaf`` and ``ccp_alpha``.
    - ``'svm'``: ``param`` is the SVC regularisation ``C``.

    Any other ``method`` yields ``None``.
    """
    if method == 'knn':
        return KNeighborsClassifier(n_neighbors=param)
    if method == 'bayes':
        return CategoricalNB(alpha=param)
    if method == 'forest':
        return RandomForestClassifier(
            n_estimators=param[0],
            max_depth=param[1],
            min_samples_leaf=param[2],
            ccp_alpha=param[3],
        )
    if method == 'svm':
        return SVC(C=param)
    return None