예제 #1
0
def test_categorical_data_digits_all_negative():
    """Declaring every digits feature categorical makes MixedNB.fit raise ValueError."""
    data = load_digits()
    features, labels = data['data'], data['target']

    # Sanity check: the same data fits fine with plain GaussianNB.
    GaussianNB().fit(features, labels)

    # Treating all pixel columns as categorical is expected to be rejected.
    clf = MixedNB(categorical_features='all')
    with pytest.raises(ValueError):
        clf.fit(features, labels)
예제 #2
0
def test_continuous_data_iris():
    """With no categorical features, MixedNB predictions match GaussianNB on iris."""
    dataset = load_iris()
    X, y = dataset['data'], dataset['target']

    # Reference predictions from sklearn's Gaussian Naive Bayes.
    reference = GaussianNB()
    reference.fit(X, y)
    expected = reference.predict(X)

    # Our classifier with the default (all-Gaussian) configuration.
    model = MixedNB()
    model.fit(X, y)
    actual = model.predict(X)

    assert (actual == expected).all()
예제 #3
0
def test_continuous_data_breast_cancer():
    """MixedNB training accuracy equals GaussianNB's on the breast-cancer data."""
    dataset = load_breast_cancer()
    X, y = dataset['data'], dataset['target']

    # Reference score from sklearn's Gaussian Naive Bayes.
    reference = GaussianNB()
    reference.fit(X, y)
    expected_score = reference.score(X, y)

    # Our classifier, all features treated as Gaussian.
    model = MixedNB()
    model.fit(X, y)
    actual_score = model.score(X, y)

    assert np.isclose(expected_score, actual_score)
예제 #4
0
def test_continuous_data_wine():
    """MixedNB training accuracy equals GaussianNB's on the wine data."""
    dataset = load_wine()
    X, y = dataset['data'], dataset['target']

    # Reference score from sklearn's Gaussian Naive Bayes.
    reference = GaussianNB()
    reference.fit(X, y)
    expected_score = reference.score(X, y)

    # Our classifier, all features treated as Gaussian.
    model = MixedNB()
    model.fit(X, y)
    actual_score = model.score(X, y)

    assert np.isclose(expected_score, actual_score)
예제 #5
0
def test_continuous_data_digits():
    """MixedNB training accuracy equals GaussianNB's on the digits data."""
    dataset = load_digits()
    X, y = dataset['data'], dataset['target']

    # Reference score from sklearn's Gaussian Naive Bayes.
    reference = GaussianNB()
    reference.fit(X, y)
    expected_score = reference.score(X, y)

    # Our classifier, all features treated as Gaussian.
    model = MixedNB()
    model.fit(X, y)
    actual_score = model.score(X, y)

    assert np.isclose(expected_score, actual_score)
예제 #6
0
def test_categorical_data_digits():
    """MixedNB with all-categorical digits features fits and scores without error."""
    data = load_digits()
    X, y = data['data'], data['target']

    # Baseline run with sklearn's GaussianNB on the full dataset.
    baseline = GaussianNB()
    baseline.fit(X, y)
    baseline.score(X, y)

    # 64 features, 17 categories each — presumably the 0..16 pixel
    # intensities of the digits data.
    clf = MixedNB(categorical_features='all',
                  max_categories=np.repeat(17, 64))
    subset_X, subset_y = X[:1440], y[:1440]
    clf.fit(subset_X, subset_y)
    clf.score(subset_X, subset_y)
예제 #7
0
# ## MixedNB with digits dataset
# ### using categorical naive bayes
# Compares sklearn's GaussianNB training accuracy against MixedNB with
# every feature treated as categorical.

# Load the required modules
import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB

# Load the digits dataset
digits = load_digits()
X = digits['data']
y = digits['target']

# Fit to `sklearn`'s GaussianNB and record its training accuracy
gaussian_nb = GaussianNB()
gaussian_nb.fit(X, y)
gaussian_nb_score = gaussian_nb.score(X, y)

# Fit to our classifier, treating all 64 features as categorical.
# max_categories=17 per feature — presumably the 0..16 pixel intensity
# levels of the digits data; TODO confirm.
mixed_nb = MixedNB(categorical_features='all',
                   max_categories=np.repeat(17, 64))
mixed_nb.fit(X, y)
mixed_nb_score = mixed_nb.score(X, y)

# Print the two training accuracies for comparison
print(gaussian_nb_score)
print(mixed_nb_score)
# NOTE(review): this fragment appears spliced in out of order — `result`,
# `Y_test`, `y_pred` and `Y_array` are only assigned further below (in the
# block that fits MixedNB on X_train). As written here, these lines would
# raise NameError at this point in the file.
print("Confusion Matrix:")
print(result)
result1 = classification_report(Y_test, y_pred)
print("Classification Report:", )
print(result1)
# Per-class metrics rendered as a heatmap (the last summary row is dropped)
rheat = classification_report(Y_test, y_pred, output_dict=True)
sns.heatmap(pd.DataFrame(rheat).iloc[:-1, :].T, annot=True)
result2 = accuracy_score(Y_test, y_pred)
print("Accuracy:", result2)
# Cross-tabulation of actual vs. predicted labels
pd.crosstab(Y_array,
            y_pred,
            rownames=["Actual Result"],
            colnames=["Predicted Result"])

from mixed_naive_bayes import MixedNB
# Fit MixedNB treating columns 0 and 1 as categorical; the rest are Gaussian.
model = MixedNB(categorical_features=[0, 1])
nb = model.fit(X_train, np.ravel(Y_train))  # np.ravel flattens Y_train to 1-D
y_pred = model.predict(X_test)
print(y_pred)
print(nb)
# NOTE(review): assumes Y_test is a DataFrame with a `Salary` column — confirm
Y_array = np.asarray(Y_test.Salary)
result = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(Y_test, y_pred)
print("Classification Report:", )
print(result1)
# Per-class metrics rendered as a heatmap (the last summary row is dropped)
rheat = classification_report(Y_test, y_pred, output_dict=True)
sns.heatmap(pd.DataFrame(rheat).iloc[:-1, :].T, annot=True)
result2 = accuracy_score(Y_test, y_pred)
print("Accuracy:", result2)
# Model 2: Multinomial NB (fitted elsewhere above; `multinomial_nb`,
# Xtrain/ytrain/Xtest/ytest come from an earlier section of this file)
multinomial_pred = multinomial_nb.fit(Xtrain, ytrain).predict(Xtest)

metrics.accuracy_score(ytest, multinomial_pred)
# Multinomial NB model accuracy is 0.774966

metrics.confusion_matrix(ytest, multinomial_pred)
#    [[10891,   469],
#    [ 2920,   780]]

############# Model 3 using Mixed NB
# here we will use Gaussian NB for continuous predictors and Multinomial NB
# for categorical predictors
# first install it using - pip install mixed-naive-bayes
from mixed_naive_bayes import MixedNB
salary_train_raw.dtypes  # inspect dtypes to identify the categorical columns
# Columns 1, 2, 4, 5, 6, 7, 8 and 12 are modelled as categorical
mixed_model = MixedNB(categorical_features=[1, 2, 4, 5, 6, 7, 8, 12])
mixed_model_pred = mixed_model.fit(Xtrain, ytrain).predict(Xtest)

metrics.accuracy_score(ytest, mixed_model_pred)
# 0.8242

# we can see that the mixed model has highest accuracy score

#################### Model 4 using Logistic regression

from sklearn.linear_model import LogisticRegression
sal_logreg = LogisticRegression()
sal_logreg.fit(Xtrain, ytrain)

logreg_pred = sal_logreg.predict(Xtest)
예제 #10
0
def test_input_wrong_dims_2():
    """fit raises ValueError when X and y have inconsistent dimensions."""
    model = MixedNB()
    with pytest.raises(ValueError):
        model.fit([[0, 1, 2]], [[0, 1]])
예제 #11
0
def test_input_y_not_encoded():
    """fit raises ValueError when class labels are not encoded as 0..k-1."""
    model = MixedNB()
    with pytest.raises(ValueError):
        model.fit([[1, 2], [2, 2], [3, 3]], [0, 8, 0])
예제 #12
0
def test_input_string_x():
    """String-valued features in X are rejected with a TypeError."""
    model = MixedNB()
    with pytest.raises(TypeError):
        model.fit([['X'], ['y']], [0, 1])
예제 #13
0
def test_input_string_y():
    """String-valued labels in y are rejected with a TypeError."""
    model = MixedNB()
    with pytest.raises(TypeError):
        model.fit([[2], [1]], [0, '1'])
예제 #14
0
def test_categorical_data_simple():
    """MixedNB fits and scores the bundled example data with columns 0-1 categorical."""
    X, y = load_example()

    model = MixedNB([0, 1])
    model.fit(X, y)
    model.score(X, y)
예제 #15
0
def test_input_param():
    """A non-numeric alpha is reported as a TypeError when fitting."""
    model = MixedNB(alpha='l')
    with pytest.raises(TypeError):
        model.fit([0, 1, 2], [0, 1, 0])
예제 #16
0
Run benchmarks on the toy datasets provided by sklearn.
This is to ensure our implementation of Gaussian Naive
Bayes behaves the same as sklearn's.
"""

from sklearn.datasets import load_iris, load_digits, \
    load_wine, load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB

# Benchmark MixedNB (all-Gaussian mode) against sklearn's GaussianNB on
# each toy dataset, printing both training accuracies.
for load_data in [load_iris, load_digits, load_wine,
                  load_breast_cancer]:

    # e.g. "load_breast_cancer" -> "breastcancer" for the banner line
    print(f"--- {''.join(load_data.__name__.split('_')[1:])} ---")

    dataset = load_data()

    X = dataset['data']
    y = dataset['target']

    # Reference: sklearn's Gaussian Naive Bayes
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_pred = gaussian_nb.predict(X)

    # Our implementation with every feature treated as Gaussian
    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_pred = mixed_nb.predict(X)

    print(f"GaussianNB: {gaussian_nb.score(X,y)}")
    print(f"MixedNB   : {mixed_nb.score(X,y)}")
    # NOTE(review): everything below looks spliced in from a different
    # script — X_train, ContinousVariable, cont_train, scaler, X_test,
    # Catergorical, Output_name and pickle are not defined anywhere above,
    # so these lines would raise NameError if this loop actually ran.
    cat_train = X_train.iloc[:,len(ContinousVariable):]
    SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)
    # Standardise continuous features in the test set
    cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:,0:len(ContinousVariable)]), columns=(ContinousVariable))
    cat_test = X_test.iloc[:,len(ContinousVariable):]
    SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)
    
    #######################################
    ### Model construction: Naive Bayes ###
    #######################################
    # Define the classifier
    # Does not require hyperparameter tuning - just need to identify the categorical variables
    # Positional indices of the categorical columns within X_train
    out = np.argwhere(X_train.columns.isin(list(X_train[Catergorical.columns]))).ravel()
    #out= list(complete_subset_features[Catergorical.columns])
    
    best_clf = MixedNB(categorical_features= out)
    # NOTE(review): the model is pickled BEFORE being fitted — the fit call
    # below is commented out, so the saved file holds an unfitted classifier.
    filename2 = Output_name+'Naive_Bayes_fitted_model.mod'
    with open( filename2, "wb") as file:
        pickle.dump(best_clf, file)
    
    print(out)
    # Fit  model
    #best_clf.fit(SX_train.astype(float),y_train.astype(int))

    # Train the model using the training sets
    #scores = cross_val_score(best_clf, SX_train.astype(float), y_train.astype(int), n_jobs=16, cv=StratifiedKFold(5))
    #accuracy = scores.mean()
    #sd= (scores.std())     
    # 0.8330165782220578 +/- 0.029993214054194695

    # Fit optimised model
예제 #18
0
# %%
mnb = MultinomialNB()

# %%
mnb.fit(X,y)
y_hat=mnb.predict(X)
y_hat


#%%
# NOTE(review): X and y are (re)defined here AFTER being used above — the
# original notebook's cell execution order likely differed from this file
# order. Column 0 of df is dropped; Credit_rating is the target.
X = df.iloc[:,1:]
y = df.Credit_rating
X
# %%
from mixed_naive_bayes import MixedNB
# Columns 1-4 are treated as categorical features
mnb = MixedNB(categorical_features=[1,2,3,4])

# %%
mnb.fit(X,y)
# %%
from sklearn.preprocessing import LabelEncoder
# Encoder fitted on the label set {1, 2, 3}; transform maps them to 0/1/2
lm = LabelEncoder()
lm.fit([1,2,3])

# %%
# NOTE(review): X_1 = X aliases X (no copy), so the column-wise writes
# below also mutate X itself.
X_1 = X
X_1.iloc[:,1:2] = lm.transform(X.iloc[:,1:2])
X_1.iloc[:,2:3] = lm.transform(X.iloc[:,2:3])
X_1.iloc[:,3:4] = lm.transform(X.iloc[:,3:4])
X_1.iloc[:,4:5] = lm.transform(X.iloc[:,4:5])
# Take the next 4 columns and collapse the one-hot block back to a single
# categorical index per row.
# NOTE(review): covertype has 4 wilderness-area and 40 soil-type columns,
# so the names `soil`/`wild` below appear swapped — harmless here, since
# both end up as categorical columns, but confirm against the data source.
soil = np.argwhere(X_raw[:, -44:-40] == 1)[:, 1]
soil = soil[:, np.newaxis]

# Take last 40 columns, likewise collapsed to one categorical index per row
wild = np.argwhere(X_raw[:, -40:] == 1)[:, 1]
wild = wild[:, np.newaxis]

# Concat X's and minus 1 from y
# to make categories start from 0
X = np.hstack([quant, soil, wild])
y = y_raw - 1

# BUG FIX: the original reused a single `clf` variable for all three models
# and only scored it in the final print lines — so every line reported the
# score of the LAST model fitted. Each model's training accuracy is now
# captured immediately after its fit. `clf` still ends up bound to the last
# model, as before, for any later code that relies on it.

# Assume all features are Gaussian (using `sklearn`'s library)
clf = GaussianNB()
clf.fit(X, y)
gaussian_score = clf.score(X, y)

# Assume all features are Gaussian (using our library)
clf = MixedNB()
clf.fit(X, y)
mixed_gaussian_score = clf.score(X, y)

# Assume features `10` and `11` are multinoulli (categorical)
# and the rest Gaussian
clf = MixedNB(categorical_features=[10, 11])
clf.fit(X, y)
mixed_categorical_score = clf.score(X, y)

print(f"--- forest covertypes---")
print(f"GaussianNB   : {gaussian_score}")
print(f"MixedNB (G)  : {mixed_gaussian_score}")
print(f"MixedNB (C+G): {mixed_categorical_score}")
# Load the preprocessed accident data (first row is the header)
pro2 = pd.read_csv("4-df_preprocessed.csv", sep=',', header=0)

#Divide the dataset into y and X (Severity is the target; column 0 dropped)
y = pro2['Severity']
X = pro2.iloc[:,1:]

# Change categorical variables into numerical variables
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# Integer-encode each listed column in place; fit_transform re-fits the
# same encoder on every column as .apply walks them.
X.iloc[:,[0,1,2,3,4,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,28,30,31,32,33,34,35]] = X.iloc[:,[0,1,2,3,4,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,28,30,31,32,33,34,35]].apply(LabelEncoder().fit_transform)

# Split the dataset into training dataset and test dataset
# (80/20, stratified so class proportions match in both splits)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =.2,random_state=1234, stratify=y)

# Build a Bayesian Classification Model and predict the type using the test data.
# The categorical_features indices match the label-encoded columns above.
gnb = MixedNB(categorical_features=[0,1,2,3,4,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,28,30,31,32,33,34,35])
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Calculate the accuracy
accuracy = gnb.score(X_test, y_test)
print('Accuracy: {0:.2f}'.format(accuracy))

# Build a confusion matrix
cm = metrics.confusion_matrix(y_test,y_pred)
print(metrics.classification_report(y_test,y_pred))

#Bayesian for PCA dataset
# Load dataset
import numpy as np