import numpy as np
import pytest
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB


def test_categorical_data_digits_all_negative():
    digits = load_digits()
    X = digits['data']
    y = digits['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    mixed_nb = MixedNB(categorical_features='all')
    with pytest.raises(ValueError):
        mixed_nb.fit(X, y)


def test_continuous_data_iris():
    iris = load_iris()
    X = iris['data']
    y = iris['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_pred = gaussian_nb.predict(X)
    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_pred = mixed_nb.predict(X)
    assert (mixed_nb_pred == gaussian_nb_pred).all()


def test_continuous_data_breast_cancer():
    breast_cancer = load_breast_cancer()
    X = breast_cancer['data']
    y = breast_cancer['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_score = gaussian_nb.score(X, y)
    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_score = mixed_nb.score(X, y)
    assert np.isclose(gaussian_nb_score, mixed_nb_score)


def test_continuous_data_wine():
    wine = load_wine()
    X = wine['data']
    y = wine['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_score = gaussian_nb.score(X, y)
    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_score = mixed_nb.score(X, y)
    assert np.isclose(gaussian_nb_score, mixed_nb_score)


def test_continuous_data_digits():
    digits = load_digits()
    X = digits['data']
    y = digits['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_score = gaussian_nb.score(X, y)
    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_score = mixed_nb.score(X, y)
    assert np.isclose(gaussian_nb_score, mixed_nb_score)


def test_categorical_data_digits():
    digits = load_digits()
    X = digits['data']
    y = digits['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb.score(X, y)
    mixed_nb = MixedNB(categorical_features='all', max_categories=np.repeat(17, 64))
    mixed_nb.fit(X[:1440], y[:1440])
    mixed_nb.score(X[:1440], y[:1440])
# ## MixedNB with digits dataset
# ### using categorical naive bayes

# Load the required modules
import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB

# Load the digits dataset
digits = load_digits()
X = digits['data']
y = digits['target']

# Fit to `sklearn`'s GaussianNB
gaussian_nb = GaussianNB()
gaussian_nb.fit(X, y)
gaussian_nb_score = gaussian_nb.score(X, y)

# Fit to our classifier
mixed_nb = MixedNB(categorical_features='all', max_categories=np.repeat(17, 64))
mixed_nb.fit(X, y)
mixed_nb_score = mixed_nb.score(X, y)

print(gaussian_nb_score)
print(mixed_nb_score)
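# A minimal standalone check (an addition, not part of the original
# script) motivating max_categories=np.repeat(17, 64): each of the 64
# digits features is a pixel intensity taking integer values 0..16,
# i.e. 17 possible categories per feature.
import numpy as np
from sklearn.datasets import load_digits

X_check = load_digits()['data']
print(X_check.min(), X_check.max())  # 0.0 16.0
print(len(np.unique(X_check)))       # 17 distinct integer intensities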
print("Confusion Matrix:") print(result) result1 = classification_report(Y_test, y_pred) print("Classification Report:", ) print(result1) rheat = classification_report(Y_test, y_pred, output_dict=True) sns.heatmap(pd.DataFrame(rheat).iloc[:-1, :].T, annot=True) result2 = accuracy_score(Y_test, y_pred) print("Accuracy:", result2) pd.crosstab(Y_array, y_pred, rownames=["Actual Result"], colnames=["Predicted Result"]) from mixed_naive_bayes import MixedNB model = MixedNB(categorical_features=[0, 1]) nb = model.fit(X_train, np.ravel(Y_train)) y_pred = model.predict(X_test) print(y_pred) print(nb) Y_array = np.asarray(Y_test.Salary) result = confusion_matrix(Y_test, y_pred) print("Confusion Matrix:") print(result) result1 = classification_report(Y_test, y_pred) print("Classification Report:", ) print(result1) rheat = classification_report(Y_test, y_pred, output_dict=True) sns.heatmap(pd.DataFrame(rheat).iloc[:-1, :].T, annot=True) result2 = accuracy_score(Y_test, y_pred) print("Accuracy:", result2)
multinomial_pred = multinomial_nb.fit(Xtrain, ytrain).predict(Xtest)
metrics.accuracy_score(ytest, multinomial_pred)
# Multinomial NB model accuracy is 0.774966
metrics.confusion_matrix(ytest, multinomial_pred)
# [[10891,  469],
#  [ 2920,  780]]

############# Model 3 using Mixed NB #############
# Here we use Gaussian NB for the continuous predictors and
# categorical (multinoulli) NB for the categorical predictors.
# First install it with: pip install mixed-naive-bayes
from mixed_naive_bayes import MixedNB

salary_train_raw.dtypes
mixed_model = MixedNB(categorical_features=[1, 2, 4, 5, 6, 7, 8, 12])
mixed_model_pred = mixed_model.fit(Xtrain, ytrain).predict(Xtest)
metrics.accuracy_score(ytest, mixed_model_pred)  # 0.8242
# The mixed model has the highest accuracy score of the models so far.

############# Model 4 using Logistic regression #############
from sklearn.linear_model import LogisticRegression
sal_logreg = LogisticRegression()
sal_logreg.fit(Xtrain, ytrain)
logreg_pred = sal_logreg.predict(Xtest)
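# A hedged helper sketch (an addition; salary_train_raw and its column
# layout are assumed from the surrounding script) showing how the
# hardcoded categorical_features list could be derived from dtypes:
# object-typed columns are treated as categorical.
import pandas as pd

def categorical_indices(df: pd.DataFrame) -> list:
    """Return positional indices of object-dtype (categorical) columns."""
    return [i for i, dtype in enumerate(df.dtypes) if dtype == object]

# Expected to reproduce [1, 2, 4, 5, 6, 7, 8, 12] for this dataset:
# categorical_indices(salary_train_raw)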
import pytest
from mixed_naive_bayes import MixedNB, load_example


def test_input_wrong_dims_2():
    clf = MixedNB()
    with pytest.raises(ValueError):
        clf.fit([[0, 1, 2]], [[0, 1]])


def test_input_y_not_encoded():
    clf = MixedNB()
    with pytest.raises(ValueError):
        clf.fit([[1, 2], [2, 2], [3, 3]], [0, 8, 0])


def test_input_string_x():
    clf = MixedNB()
    with pytest.raises(TypeError):
        clf.fit([['X'], ['y']], [0, 1])


def test_input_string_y():
    clf = MixedNB()
    with pytest.raises(TypeError):
        clf.fit([[2], [1]], [0, '1'])


def test_categorical_data_simple():
    X, y = load_example()
    mixed_nb = MixedNB([0, 1])
    mixed_nb.fit(X, y)
    mixed_nb.score(X, y)


def test_input_param():
    clf = MixedNB(alpha='l')
    with pytest.raises(TypeError):
        clf.fit([0, 1, 2], [0, 1, 0])
"""
Run benchmarks on toy datasets provided by sklearn.

This is to ensure our implementation of Gaussian Naive Bayes
is the same as sklearn's.
"""
from sklearn.datasets import load_iris, load_digits, \
    load_wine, load_breast_cancer
from sklearn.naive_bayes import GaussianNB

from mixed_naive_bayes import MixedNB

for load_data in [load_iris, load_digits, load_wine, load_breast_cancer]:
    print(f"--- {''.join(load_data.__name__.split('_')[1:])} ---")
    dataset = load_data()
    X = dataset['data']
    y = dataset['target']

    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_pred = gaussian_nb.predict(X)

    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_pred = mixed_nb.predict(X)

    print(f"GaussianNB: {gaussian_nb.score(X, y)}")
    print(f"MixedNB   : {mixed_nb.score(X, y)}")
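# A possible extension sketch (an addition, not in the original
# benchmark): beyond matching accuracies, the per-class probabilities of
# the two models can be compared directly. This assumes MixedNB exposes
# a predict_proba method mirroring sklearn's estimator API.
import numpy as np
print(np.allclose(gaussian_nb.predict_proba(X), mixed_nb.predict_proba(X)))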
cat_train = X_train.iloc[:, len(ContinousVariable):]
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)

# Standardise continuous features in the test set
cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:, 0:len(ContinousVariable)]),
                         columns=ContinousVariable)
cat_test = X_test.iloc[:, len(ContinousVariable):]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)

#######################################
### Model construction: Naive Bayes ###
#######################################

# Define the classifier.
# No hyperparameter tuning is required - we just need to identify the
# categorical variables by their positional indices.
out = np.argwhere(X_train.columns.isin(list(X_train[Catergorical.columns]))).ravel()
# out = list(complete_subset_features[Catergorical.columns])
best_clf = MixedNB(categorical_features=out)

filename2 = Output_name + 'Naive_Bayes_fitted_model.mod'
with open(filename2, "wb") as file:
    pickle.dump(best_clf, file)
print(out)

# Fit model
# best_clf.fit(SX_train.astype(float), y_train.astype(int))  # Train the model using the training sets
# scores = cross_val_score(best_clf, SX_train.astype(float), y_train.astype(int),
#                          n_jobs=16, cv=StratifiedKFold(5))
# accuracy = scores.mean()
# sd = scores.std()
# 0.8330165782220578 +/- 0.029993214054194695

# Fit optimised model
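# A hedged usage sketch (an addition): reloading the pickled classifier.
# Note that the model above is pickled before the (commented-out) fit
# lines run, so a reloaded model must be fitted before predict/score.
import pickle

with open(filename2, "rb") as file:
    loaded_clf = pickle.load(file)
# loaded_clf.fit(SX_train.astype(float), y_train.astype(int))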
# %%
mnb = MultinomialNB()

# %%
mnb.fit(X, y)
y_hat = mnb.predict(X)
y_hat

# %%
X = df.iloc[:, 1:]
y = df.Credit_rating
X

# %%
from mixed_naive_bayes import MixedNB
mnb = MixedNB(categorical_features=[1, 2, 3, 4])

# %%
mnb.fit(X, y)

# %%
from sklearn.preprocessing import LabelEncoder
lm = LabelEncoder()
lm.fit([1, 2, 3])

# %%
X_1 = X
X_1.iloc[:, 1] = lm.transform(X.iloc[:, 1])
X_1.iloc[:, 2] = lm.transform(X.iloc[:, 2])
X_1.iloc[:, 3] = lm.transform(X.iloc[:, 3])
X_1.iloc[:, 4] = lm.transform(X.iloc[:, 4])
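# %%
# An alternative sketch (an addition, reusing df from above): encode
# each categorical column with its own fit via apply, instead of reusing
# one LabelEncoder fitted on [1, 2, 3] for every column.
from sklearn.preprocessing import LabelEncoder
X_2 = df.iloc[:, 1:].copy()
X_2.iloc[:, 1:5] = X_2.iloc[:, 1:5].apply(LabelEncoder().fit_transform)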
# Take the next 4 columns (in the covertype layout, the last 44 columns
# are 4 binary wilderness-area indicators followed by 40 binary
# soil-type indicators)
wilderness = np.argwhere(X_raw[:, -44:-40] == 1)[:, 1]
wilderness = wilderness[:, np.newaxis]

# Take the last 40 columns
soil = np.argwhere(X_raw[:, -40:] == 1)[:, 1]
soil = soil[:, np.newaxis]

# Concat X's and subtract 1 from y so categories start from 0
X = np.hstack([quant, wilderness, soil])
y = y_raw - 1

# Assume all features are Gaussian (using `sklearn`'s library)
gaussian_clf = GaussianNB()
gaussian_clf.fit(X, y)

# Assume all features are Gaussian (using our library)
mixed_g_clf = MixedNB()
mixed_g_clf.fit(X, y)

# Assume features `10` and `11` are multinoulli (categorical)
# and the rest Gaussian
mixed_cg_clf = MixedNB(categorical_features=[10, 11])
mixed_cg_clf.fit(X, y)

print("--- forest covertypes ---")
print(f"GaussianNB   : {gaussian_clf.score(X, y)}")
print(f"MixedNB (G)  : {mixed_g_clf.score(X, y)}")
print(f"MixedNB (C+G): {mixed_cg_clf.score(X, y)}")
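# A tiny standalone check (an addition; toy array, not the covertype
# data) of the np.argwhere decoding used above: it recovers the ordinal
# category of each row as the column index of that row's single 1.
import numpy as np

one_hot = np.array([[0, 1, 0],
                    [1, 0, 0],
                    [0, 0, 1]])
ordinal = np.argwhere(one_hot == 1)[:, 1]
print(ordinal)  # [1 0 2]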
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from mixed_naive_bayes import MixedNB

pro2 = pd.read_csv("4-df_preprocessed.csv", sep=',', header=0)

# Divide the dataset into y and X
y = pro2['Severity']
X = pro2.iloc[:, 1:]

# Encode categorical variables as integers
cat_cols = [0, 1, 2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 26, 28, 30, 31, 32, 33, 34, 35]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X.iloc[:, cat_cols] = X.iloc[:, cat_cols].apply(LabelEncoder().fit_transform)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=1234, stratify=y)

# Build a Bayesian classification model and predict the severity
# on the test data
gnb = MixedNB(categorical_features=cat_cols)
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Calculate the accuracy
accuracy = gnb.score(X_test, y_test)
print('Accuracy: {0:.2f}'.format(accuracy))

# Build a confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
print(metrics.classification_report(y_test, y_pred))

# Bayesian classifier for the PCA dataset
# Load dataset
import numpy as np