Example #1
def train_and_test_unbalanced(sql_dir):
	X_train, X_test, y_train, y_test = count_vectorizer(sql_dir)
	X_train_tfidf, X_test_tfidf, y_train, y_test = tfidf_vectorizer_word(sql_dir)
	# X_train_tfidf, X_test_tfidf, y_train, y_test = tfidf_vectorizer_ngram(sql_dir)
	# X_train_tfidf, X_test_tfidf, y_train, y_test = tfidf_vectorizer_chars(sql_dir)

	# Naive Bayes on Count Vectors
	nb, accuracy = train_model(naive_bayes.ComplementNB(), X_train, y_train, X_test, y_test)
	print("NB, Count Vectors: ", accuracy)

	# Naive Bayes on TF-IDF Vectors
	nb_tfidf, accuracy = train_model(naive_bayes.ComplementNB(), X_train_tfidf, y_train, X_test_tfidf, y_test)
	print("NB, TF-IDF Vectors: ", accuracy)

	# Linear Classifier on Count Vectors
	lr, accuracy = train_model(linear_model.LogisticRegression(class_weight='balanced'), X_train, y_train, X_test, y_test)
	print ("LR, Count Vectors: ", accuracy)

	# Linear Classifier on TF-IDF Vectors
	lr_tfidf, accuracy = train_model(linear_model.LogisticRegression(class_weight='balanced'), X_train_tfidf, y_train, X_test_tfidf, y_test)
	print ("LR, TF-IDF Vectors: ", accuracy)

	# RF on Count Vectors
	rf, accuracy = train_model(ensemble.RandomForestClassifier(class_weight='balanced'), X_train, y_train, X_test, y_test)
	print("RF, Count Vectors: ", accuracy)

	# RF on TF-IDF Vectors
	rf_tfidf, accuracy = train_model(ensemble.RandomForestClassifier(class_weight='balanced'), X_train_tfidf, y_train, X_test_tfidf, y_test)
	print("RF, TF-IDF Vectors: ", accuracy)
Example #2
def model_build(alg_name, params):
    alg = None
    if alg_name == 'SVM':
        if params['kernel'] == 'linear':
            alg = SVC(C=params['C'], probability=True, kernel='linear')
        elif params['kernel'] == 'rbf':
            alg = SVC(C=params['C'], gamma=params['gammas'], probability=True, kernel=params['kernel'])
        elif params['kernel'] == 'poly':
            alg = SVC(C=params['C'], degree=params['degree'], probability=True, kernel=params['kernel'])
    elif alg_name == 'KNN':
        alg = KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'], leaf_size=params['leaf_size'])
    elif alg_name == 'Random Forest':
        alg = RandomForestClassifier(n_estimators=params['n_estimators'], criterion=params['criterion'],
                                     max_features=params['max_features'], random_state=1234)
    elif alg_name == 'LightGBM':
        alg = lgb.LGBMClassifier(learning_rate=params['learning_rate'], num_leaves=params['num_leaves'],
                                 n_estimators=params['n_estimators'], objective=params['objective'])
    elif alg_name == 'XGBoost':
        alg = XGBClassifier(objective=params['objective'], eval_metric=params['eval_metrics'],
                            learning_rate=params['learning_rate'], max_depth=params['max_depth'])
    elif alg_name == 'Naive Bayes':
        if params['distribution'] == 'Multinomial Naive Bayes':
            alg = naive_bayes.MultinomialNB(alpha=params['alpha'], fit_prior=params['fit_prior'])
        elif params['distribution'] == 'Gaussian Naive Bayes':
            alg = naive_bayes.GaussianNB()
        elif params['distribution'] == 'Complement Naive Bayes':
            alg = naive_bayes.ComplementNB(alpha=params['alpha'], fit_prior=params['fit_prior'], norm=params['norm'])
        elif params['distribution'] == 'Bernoulli Naive Bayes':
            alg = naive_bayes.BernoulliNB(alpha=params['alpha'], fit_prior=params['fit_prior'],
                                          binarize=params['binarize'])
        elif params['distribution'] == 'Categorical Naive Bayes':
            alg = naive_bayes.CategoricalNB(alpha=params['alpha'], fit_prior=params['fit_prior'])
    return alg
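
A hypothetical call, assuming a params dict carrying the keys used above:

alg = model_build('SVM', {'kernel': 'rbf', 'C': 1.0, 'gammas': 0.1})
alg.fit(X_train, y_train)  # X_train/y_train are assumed to exist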
Example #3
def set_model(classifier):
    param_grid = {'alpha': [1e-10, 1e-1, 1, 10]}

    if classifier == 'MNB':
        clf = naive_bayes.MultinomialNB(alpha=1e-10, class_prior=[.1, .9])
        param_grid = {
            'alpha': [1e-10, 1e-1, 1, 10],
            'class_prior': [[.1, .9], [.25, .75], [.75, .25]]
        }
    elif classifier == 'BNB':
        clf = naive_bayes.BernoulliNB(alpha=1e-10, class_prior=[.1, .9])
    elif classifier == 'CNB':
        clf = naive_bayes.ComplementNB(alpha=1e-10, norm=True)
    elif classifier == 'KNN':
        clf = neighbors.KNeighborsClassifier(n_neighbors=1)
    elif classifier == 'SVM':
        clf = svm.SVC(kernel='linear')
        param_grid = {'C': [1e-3, 1]}
    elif classifier == 'DT':
        clf = tree.DecisionTreeClassifier()
    elif classifier == 'RF':
        clf = ensemble.RandomForestClassifier()
    else:
        print("Please enter a valid model.")
        raise SystemExit

    return clf, param_grid
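
The returned pair is shaped for a grid search; a minimal sketch of the intended use (the import and data names are assumptions):

from sklearn.model_selection import GridSearchCV

clf, param_grid = set_model('MNB')
search = GridSearchCV(clf, param_grid, cv=5)
search.fit(X_train, y_train)  # X_train/y_train are assumed to exist
print(search.best_params_)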
Example #4
File: IA.py Project: BurdescuAlex/AI
def use_cl(train_samples, train_labels):
    def an(sir):
        # tokenize by splitting on runs of whitespace
        inner_words = re.split(r'\s+', sir)
        return inner_words

    print("USING CLASSIFIER ")
    model = naive_bayes.ComplementNB()

    # If we use tokenizer for the NN, here we use TFIDFVectorizer to transform the text
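    # Note: ngram_range is ignored when analyzer is a callable, so the
    # vectorizer below effectively uses only the tokens produced by an()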
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer=an)
    train_txt = vectorizer.fit_transform(train_samples['Text'])
    test_txt = vectorizer.transform(test_samples['Text'])
    val_samples = vectorizer.transform(validation_samples['Text'])

    # Use GridSearch to find the best model !
    parameters = {
        'alpha': [
            10, 0.05, 0.1, 0.01, 0.14, 0.13, 0.12, 0.15, 0.16, 0.17, 0.18,
            0.19, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3,
            1.4, 1.5, 10.1, 10.2, 10.3, 10.4, 10.5, 9.9, 9.8, 9.7, 9.6
        ]
    }

    model2 = GridSearchCV(model, parameters)
    model2.fit(train_txt, train_labels['Prediction'])

    print("Best Params: " + str(model2.best_params_))

    model_best = model2.best_estimator_

    scores_cl(model_best, val_samples, validation_labels['Prediction'])
    predict_cl(model_best, test_txt, test_id, output=True)
Example #5
def set_model(classifier):
    param_grid = {}

    if classifier == 'MNB':
        clf = naive_bayes.MultinomialNB(alpha=1e-10)
        param_grid = {
            'class_prior': [[.5, .5], [.48, .52], [.45, .55], [.4, .6], [.6, .4],
                            [.75, .25], [.25, .75], [.9, .1], [.1, .9]],
            'alpha': [1e-10, 1e-1, 1, 10, 100]
        }
    elif classifier == 'BNB':
        clf = naive_bayes.BernoulliNB(alpha=1e-1)
    elif classifier == 'CNB':
        clf = naive_bayes.ComplementNB(norm=True)
    elif classifier == 'KNN':
        clf = neighbors.KNeighborsClassifier(n_neighbors=1)
        param_grid={'n_neighbors':[1,5,10], 'leaf_size':[10,30,50]}
        #param_grid = {'p': [1,2], 'n_neighbors':[1,2,3,4,5,6,7,8,9,10], 'leaf_size':[10,20,30,40,50], 'weights':['uniform','distance'], 'algorithm':['auto','ball_tree','kd_tree','brute']}
    elif classifier == 'SVM':
        clf = svm.LinearSVC(C=10) 
        param_grid = {'C':[.001,.01,.1,1,10,100]}
        #param_grid = {'tol': [1e-3, 1e-4, 1e-5, 1e-6], 'dual':[True, False], 'max_iter':[1000,10000], 'C': [.001, .01, .1, 10, 100, 1000], 'class_weight': [{'Y':1.5, 'N':1}, {'Y':3, 'N':1.5}, {'Y':1, 'N':.75}, {'Y':1, 'N':.4}, {'Y':.75, 'N':.25}, {'Y':1, 'N':3}]}
    elif classifier == 'DT':
        clf = tree.DecisionTreeClassifier()
        param_grid={'max_depth':[None,10,100,1000,10000],'min_weight_fraction_leaf':[0,.01,.1,.2,.3,.4,.5]}
    elif classifier == 'RF':
        clf = ensemble.RandomForestClassifier()
    elif classifier == 'NN':
        clf = neural_network.MLPClassifier()
    elif classifier == 'do_all':
        clf = 'all'
    else:
        print("Please enter a valid model.")
        raise SystemExit

    return clf, param_grid
Example #6
def NB_param_selection(X, y, nfolds):
    alphas = np.linspace(0.001, 10.0, num=100)
    param_grid = {'alpha': alphas}
    grid_search = GridSearchCV(naive_bayes.ComplementNB(),
                               param_grid,
                               cv=nfolds,
                               verbose=4)
    grid_search.fit(X, y)
    return grid_search
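
A hypothetical invocation (X_train/y_train are assumed names, not part of the snippet):

grid = NB_param_selection(X_train, y_train, nfolds=5)
print(grid.best_params_, grid.best_score_)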
Example #7
def classification_report(sql_dir):
	vectorizer = CountVectorizer(encoding='latin-1')
	encoder, X_train, X_test, y_train, y_test = dataset_divider(sql_dir, vectorizer, need_encoder = True)
	best_model, accuracy = train_model(naive_bayes.ComplementNB(), X_train, y_train, X_test, y_test)
	y_pred = best_model.predict(X_test)

	precision, recall, fscore, support = score(y_test, y_pred, average='macro')
	print('Precision : {}'.format(precision))
	print('Recall    : {}'.format(recall))
	print('F-score   : {}'.format(fscore))
Example #8
def train_nb(_x_train, _y_train, _x_test, _y_test):
    nb_param_grid = dict(alpha=[0, 0.1, 0.5, 1], norm=[False, True])
    nb_grid = RandomizedSearchCV(estimator=naive_bayes.ComplementNB(),
                                 param_distributions=nb_param_grid,
                                 cv=5,
                                 verbose=10,
                                 n_jobs=-1,
                                 scoring='accuracy')
    # naive.fit(Train_X_Tfidf, Y_train)
    # predictions_NB = naive.predict(Test_X_Tfidf)
    nb_grid.fit(_x_train, np.ravel(_y_train))
    predictions_NB = nb_grid.predict(_x_test)
    nb_accr = accuracy_score(predictions_NB, np.ravel(_y_test))

    return nb_grid, nb_accr
Example #9
    def init_naive_bayes(self) -> None:
        """
        MultinomialNB works with occurrence counts;
        BernoulliNB is designed for binary/boolean features.
        """
        all_models = [
            naive_bayes.BernoulliNB(),
            naive_bayes.GaussianNB(),
            naive_bayes.MultinomialNB(),
            naive_bayes.ComplementNB()
        ]
        self.models.extend(all_models)
        models = ["bernoulli", "gaussian", "multinomial", "complement"]
        for mod in models:
            self.model_keys[mod] = "nb"
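
As the docstring notes, the variants expect different feature types; a small illustrative sketch (hypothetical data, not taken from the class above):

import numpy as np
from sklearn import naive_bayes

X_counts = np.random.randint(5, size=(100, 20))  # occurrence counts
X_binary = (X_counts > 0).astype(int)            # binary presence/absence
y = np.random.randint(2, size=100)

naive_bayes.MultinomialNB().fit(X_counts, y)  # counts
naive_bayes.ComplementNB().fit(X_counts, y)   # counts, robust to class imbalance
naive_bayes.BernoulliNB().fit(X_binary, y)    # binary features
naive_bayes.GaussianNB().fit(X_counts, y)     # continuous features (dense input)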
Example #10
    def __init__(self, trainDF):
        super().__init__()
        prePro = PreProcessor()
        self.pf = PlotFunctions()
        self.trainDF = trainDF
        self.X_train, self.X_test, self.y_train, self.y_test = \
            prePro.split_train_test(trainDF['cleaned_sentence'], trainDF['classification'], 0.4)
        self.X_test, self.X_cross, self.y_test, self.y_cross = \
            prePro.split_train_test(self.X_test, self.y_test, 0.5)

        self.all_scores = list()
        self.models = {
            'MultinomialNB':
            naive_bayes.MultinomialNB(alpha=0.767,
                                      class_prior=None,
                                      fit_prior=True),
            'ComplementNB':
            naive_bayes.ComplementNB(alpha=0.767,
                                     class_prior=None,
                                     fit_prior=True),
            'LogisticRegression':
            linear_model.LogisticRegression(solver='lbfgs')
        }
Example #11
def document_everything(X_train, X_test, y_train, y_test):
    Reports = {}
    Accuracies = {}
    conf_matrices = {}
    
    clf = svm.LinearSVC(C=10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['SVM'] = clf.score(X_test, y_test)
    Reports['SVM'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['SVM'] = metrics.confusion_matrix(y_test, y_pred)

    print('SVM Done')

    clf = ensemble.RandomForestClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['RF'] = clf.score(X_test, y_test)
    Reports['RF'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['RF'] = metrics.confusion_matrix(y_test, y_pred)

    print('RF Done')

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['DT'] = clf.score(X_test, y_test)
    Reports['DT'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['DT'] = metrics.confusion_matrix(y_test, y_pred)

    print('DT Done')
    
    clf = neighbors.KNeighborsClassifier(n_neighbors=1,leaf_size=10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['KNN'] = clf.score(X_test, y_test)
    Reports['KNN'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['KNN'] = metrics.confusion_matrix(y_test, y_pred)

    print('KNN Done')

    clf = naive_bayes.MultinomialNB()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['MNB'] = clf.score(X_test, y_test)
    Reports['MNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['MNB'] = metrics.confusion_matrix(y_test, y_pred)
    
    print('MNB Done')
    
    clf = naive_bayes.BernoulliNB(alpha=1e-10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['BNB'] = clf.score(X_test, y_test)
    Reports['BNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['BNB'] = metrics.confusion_matrix(y_test, y_pred)

    print('BNB Done')

    clf = naive_bayes.ComplementNB()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['CNB'] = clf.score(X_test, y_test)
    Reports['CNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['CNB'] = metrics.confusion_matrix(y_test, y_pred)

    print('CNB Done')
    
    clf = neural_network.MLPClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['MLP'] = clf.score(X_test, y_test)
    Reports['MLP'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['MLP'] = metrics.confusion_matrix(y_test, y_pred)

    print('MLP Done')

    for clf in Reports:
        print(str(clf))
        print(str(Accuracies[clf]))
        print(str(Reports[clf]))
        print(str(conf_matrices[clf]))
        size = [conf_matrices[clf][0][0], conf_matrices[clf][0][1],
                conf_matrices[clf][1][0], conf_matrices[clf][1][1]]
        labels = 'True Negatives', 'False Positives', 'False Negatives', 'True Positives'
        explode = (0, .1, 0, .1)
        fig1, ax1 = plt.subplots()
        ax1.pie(size, explode=explode, labels=labels)
        ax1.axis('equal')
        plt.show()
Example #12
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        train_size=i,
                                                        random_state=0)

    classifier = naive_bayes.BernoulliNB()
    classifier.fit(X=X_train, y=y_train)
    predicted = classifier.predict(X_test)
    BernoulliNB_accuracy.append(accuracy_score(y_test, predicted))
    '''
    classifier = naive_bayes.CategoricalNB()
    classifier.fit(X=X_train, y= y_train)
    predicted = classifier.predict(X_test)
    CategoricalNB_accuracy.append(accuracy_score(y_test, predicted))
    '''
    classifier = naive_bayes.ComplementNB()
    classifier.fit(X=X_train, y=y_train)
    predicted = classifier.predict(X_test)
    ComplementNB_accuracy.append(accuracy_score(y_test, predicted))

    classifier = naive_bayes.GaussianNB()
    classifier.fit(X=X_train, y=y_train)
    predicted = classifier.predict(X_test)
    GaussianNB_accuracy.append(accuracy_score(y_test, predicted))

    classifier = naive_bayes.MultinomialNB()
    classifier.fit(X=X_train, y=y_train)
    predicted = classifier.predict(X_test)
    MultinomialNB_accuracy.append(accuracy_score(y_test, predicted))

    ## DTs
Example #13
#                              'avg_RTs', 'sbjACC', 'pid', 'picid'])

'''
X = [[data['congruency'], data['response'], data['prev_congruency'],
      data['prev_response'], data['avg_RTs']] for i in range(len(data['congruency']))]
y = [data['sbjACC'] for i in range(len(data['congruency']))]
'''

X = [data[i][0:4] for i in range(len(data))]
y = [data[i][5] for i in range(len(data))]

# X = X.norma

X, y = np.array(X), np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
regressor = nb.ComplementNB()
regressor.fit(X_train, y_train)  # training the algorithm

y_pred = regressor.predict(X_test)

print("Naive Bayes")
print("Score on all data: ", regressor.score(X, y))
print("Test Score: ", regressor.score(X_test, y_test))
print("Test Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
print("F1 Score: ", metrics.f1_score(y_test, y_pred))
print("Parameters: ", regressor.get_params())

from sklearn.metrics import classification_report, confusion_matrix

print("Confusion Matrix for Training Data:")
print(confusion_matrix(y_train, regressor.predict(X_train)))
Example #14
import sklearn.naive_bayes as nb
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('spambase.data')
spam_features = data.drop('spam_or_not', axis=1)
spam_labels = data.loc[:, 'spam_or_not']

bayes = {
    'Gaussian': nb.GaussianNB(),
    'Multinomial': nb.MultinomialNB(),
    'Complement': nb.ComplementNB(),
    'Bernoulli': nb.BernoulliNB()
}
for k in bayes.keys():
    predictions = bayes[k].fit(spam_features,
                               spam_labels).predict(spam_features)
    misses = (spam_labels != predictions).sum()
    # among the misclassified points, a prediction of 1 (spam) is a false positive
    false_positive = (predictions[predictions != spam_labels] == 1).sum()
    false_negative = misses - false_positive
    print(
        '{} Method Mislabelled {} points {}% {} false positive {} false negative'
        .format(k, misses, misses / spam_labels.shape[0] * 100, false_positive,
                false_negative))
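
The same counts can be read off sklearn's confusion matrix directly (assuming label 1 marks spam, with predictions as computed in the loop above):

from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(spam_labels, predictions).ravel()
print('{} false positives, {} false negatives'.format(fp, fn))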
Example #15
# -*- coding: utf-8 -*-

import numpy as np
from sklearn import naive_bayes

X = np.random.randint(50, size=(1000, 100))
y = np.random.randint(6, size=(1000))

clf = naive_bayes.ComplementNB()
clf.fit(X, y)

print(clf.predict(X[2:3]))
Example #16
lin_svc.fit(X_train, y_train)
y_test_pred = lin_svc.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = lin_svc.score(X_test, y_test)
no_selection_performance.append(('LinearSVC', score, matrix))

print("Multinomial Naive Bayes")
multNB = naive_bayes.MultinomialNB()
multNB.fit(X_train, y_train)
y_test_pred = multNB.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = multNB.score(X_test, y_test)
no_selection_performance.append(('Multinomial Naive Bayes', score, matrix))

print('Complement Naive Bayes')
compNB = naive_bayes.ComplementNB()
compNB.fit(X_train, y_train)
y_test_pred = compNB.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = compNB.score(X_test, y_test)
no_selection_performance.append(('Complement Naive Bayes', score, matrix))

print('Gradient Boosting Classifier')
gradBoost = ensemble.GradientBoostingClassifier()
gradBoost.fit(X_train, y_train)
y_test_pred = gradBoost.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = gradBoost.score(X_test, y_test)
no_selection_performance.append(
    ('Gradient Boosting Classifier', score, matrix))
Example #17
File: app.py Project: zack624/nlp-car
def my_naive_bayes(x, y):
    # model = nb.BernoulliNB()
    # model = nb.MultinomialNB()
    model = nb.ComplementNB()
    model.fit(x, y)
    return model
Example #18
    
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    accuracy = metrics.accuracy_score(predictions, valid_y)

    return accuracy

	
# Naive Bayes
accuracy = train_model(naive_bayes.MultinomialNB(alpha=1e-5), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ")
print('Accuracy: ', accuracy)

accuracy = train_model(naive_bayes.ComplementNB(alpha=1e-5), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ")
print('Accuracy: ', accuracy)

accuracy = train_model(naive_bayes.MultinomialNB(alpha=1e-5), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, BigramLevel TF-IDF: ")
print('Accuracy: ', accuracy)

# RF
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)


accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("RF, BigramLevel TF-IDF: ", accuracy)
Example #19
# 1.2. Train the vectorizer
vectorizer = feature_extraction.text.TfidfVectorizer(min_df=5, max_df=0.1, stop_words='english')
#vectorizer = feature_extraction.text.TfidfVectorizer(stop_words='english') # Try Kim's vectorizer
#vectorizer = feature_extraction.text.TfidfVectorizer(max_df=0.08, stop_words='english', strip_accents='ascii', token_pattern='[A-Za-z]{1,}', ngram_range=(1,2), sublinear_tf=True) # Try Seo's vectorizer
vectorizer.fit(train_raw.data)
print('* The size of vocabulary: ', len(vectorizer.vocabulary_))

# 1.3. Vectorize the training and test data
train_vectors = vectorizer.transform(train_raw.data)
tests_vectors = vectorizer.transform(tests_raw.data)

# 2. Instantiate classifier models
models = [
    {'name': 'linear_model.SGD',   'inst': linear_model.SGDClassifier()},
    {'name': 'naive_bayes.CompNB', 'inst': naive_bayes.ComplementNB(alpha=0.4)},
    {'name': 'svm.LinearSVC',      'inst': svm.LinearSVC(class_weight='balanced')},
    {'name': 'svm.SVC(linear)',    'inst': svm.SVC(kernel='linear', class_weight='balanced')},
    {'name': 'svm.SVC(rbf)',       'inst': svm.SVC(class_weight='balanced')},
    {'name': 'neural_network.MLP', 'inst': neural_network.MLPClassifier(learning_rate='adaptive', early_stopping=True, verbose=True)},
]

# 3. Evaluate the classifier models
for m in models:
    # Train the model
    m['inst'].fit(train_vectors, train_raw.target)
    train_predict = m['inst'].predict(train_vectors)
    train_accuracy = metrics.balanced_accuracy_score(train_raw.target, train_predict)

    # Test the model
    tests_predict = m['inst'].predict(tests_vectors)
Example #20
data = data.reindex(columns=reorder_colnames)
data = pd.get_dummies(data, columns=['race'])

features = ['age', 'fnlwgt', 'work_ Private','work_ Self-emp','work_ Government', 'edunum', 'marital', 'relation', 'sex', 'gain', 'loss', 'hpw', 'country',
            'occu_ Adm-clerical', 'occu_ Armed-Forces', 'occu_ Craft-repair', 'occu_ Exec-managerial', 'occu_ Farming-fishing', 'occu_ Handlers-cleaners',
            'occu_ Machine-op-inspct', 'occu_ Other-service', 'occu_ Priv-house-serv', 'occu_ Prof-specialty', 'occu_ Protective-serv', 'occu_ Sales',
            'occu_ Tech-support', 'occu_ Transport-moving', 'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White']
y = data['income']
X = data[features]
print('Using features: ' + str(features))

# Define the Naive Bayes models
gaussianModel = nb.GaussianNB()
bernoulliModel = nb.BernoulliNB()
multinomialModel = nb.MultinomialNB()
complementModel = nb.ComplementNB()

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Test options and evaluation metric
scoring = 'accuracy'

# Fit the training sets
gaussianModel.fit(X_train, y_train)
bernoulliModel.fit(X_train, y_train)
multinomialModel.fit(X_train, y_train)
complementModel.fit(X_train, y_train)

# Predict for the test sets
predG = gaussianModel.predict(X_test)
Example #21
class CustomComplementNB:
    def __init__(self, model):
        # transpose so an (n_samples, n_features) matrix can be dotted directly
        self.feature_log_pro_ = model.feature_log_prob_.T

    def predict_proba(self, X):
        jll = np.dot(X, self.feature_log_pro_)            # joint log likelihood
        log_prob_x = np.log(np.sum(np.exp(jll), axis=1))  # log normalizer per sample
        result = jll - np.atleast_2d(log_prob_x).T        # normalized log probabilities
        return np.exp(result)
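
Exponentiating jll directly can overflow for large log likelihoods; a numerically safer sketch of the same normalization, assuming SciPy is available:

from scipy.special import logsumexp

def predict_proba_stable(jll):
    # subtract the log normalizer in log space before exponentiating
    return np.exp(jll - logsumexp(jll, axis=1, keepdims=True))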


if __name__ == '__main__':
    feature, label = Iris.features, Iris.label
    train_feature, test_feature, train_label, test_label = data_split.split(
        feature, label)
    nativeBayes = bayes.GaussianNB()
    nativeBayes.fit(train_feature, train_label)
    pred = nativeBayes.predict_proba(test_feature)[0]
    print("pred", pred)
    myBayes = CustomGaussianBayes(nativeBayes)
    my_pred = myBayes.predict_proba(test_feature)[0]
    print("my_pred", my_pred)

    complementBayes = bayes.ComplementNB()
    complementBayes.fit(train_feature, train_label)
    pred = complementBayes.predict_proba(test_feature)[0]
    print("pred", pred)
    myComplementBayes = CustomComplementNB(complementBayes)
    my_pred = myComplementBayes.predict_proba(test_feature)[0]
    print("my_pred", my_pred)
Example #22
# RF on Count Vectors
rf, accuracy = train_model(ensemble.RandomForestClassifier(), X_train, y_train, X_test, y_test)
print("RF, Count Vectors: ", accuracy)

# RF on TF-IDF Vectors
rf_tfidf, accuracy = train_model(ensemble.RandomForestClassifier(), X_train_tfidf, y_train, X_test_tfidf, y_test)
print("RF, TF-IDF Vectors: ", accuracy)


# ## For unbalanced dataset

# In[20]:


# Naive Bayes on Count Vectors
nb, accuracy = train_model(naive_bayes.ComplementNB(), X_train, y_train, X_test, y_test)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on TF-IDF Vectors
nb_tfidf, accuracy = train_model(naive_bayes.ComplementNB(), X_train_tfidf, y_train, X_test_tfidf, y_test)
print("NB, TF-IDF Vectors: ", accuracy)

# Linear Classifier on Count Vectors
lr, accuracy = train_model(linear_model.LogisticRegression(class_weight='balanced'), X_train, y_train, X_test, y_test)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on TF-IDF Vectors
lr_tfidf, accuracy = train_model(linear_model.LogisticRegression(class_weight='balanced'), X_train_tfidf, y_train, X_test_tfidf, y_test)
print ("LR, TF-IDF Vectors: ", accuracy)

# RF on Count Vectors