Example #1
# individual
#

# def individual(args):
#     name_cls = args[0]
#     wX = args[1]
#     wy = args[2]
#     return name_cls.fit(wX, wy)

from sklearn import linear_model, naive_bayes, neighbors, svm, tree

def individual(name_cls, wX, wy):
    # Fit the given estimator on the provided training data and return the fitted model.
    return name_cls.fit(wX, wy)

NAME_INDIVIDUALS = {
    'DT'  : tree.DecisionTreeClassifier(),
    'NB'  : naive_bayes.GaussianNB(),
    'SVM' : svm.SVC(gamma='scale'),
    'LSVM': svm.LinearSVC(),
    'KNNu': neighbors.KNeighborsClassifier(weights='uniform'),
    'KNNd': neighbors.KNeighborsClassifier(weights='distance'),
    'LM1' : linear_model.SGDClassifier(penalty='l1'),
    'LM2' : linear_model.SGDClassifier(penalty='l2'),
}
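A brief usage sketch, not part of the original snippet: fit every base learner in NAME_INDIVIDUALS through individual(); the iris data here is only illustrative.

from sklearn import datasets

X_demo, y_demo = datasets.load_iris(return_X_y=True)
fitted = {name: individual(clf, X_demo, y_demo)
          for name, clf in NAME_INDIVIDUALS.items()}
print(fitted['NB'].score(X_demo, y_demo))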



#----------------------------------------
# EnsembleVoting
#----------------------------------------

Example #2
from sklearn import ensemble, gaussian_process, linear_model, naive_bayes, neighbors, svm
from xgboost import XGBClassifier

vote_est = [
    #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('ada', ensemble.AdaBoostClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('etc',ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),

    #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    ('gpc', gaussian_process.GaussianProcessClassifier()),

    #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    ('lr', linear_model.LogisticRegressionCV()),

    #Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB()),

    #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    ('knn', neighbors.KNeighborsClassifier()),

    #SVM: http://scikit-learn.org/stable/modules/svm.html
    ('svc', svm.SVC(probability=True)),

    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    ('xgb', XGBClassifier())

]


#Hard Vote or majority rules
vote_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
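A hedged continuation sketch (not part of the original snippet): build the soft-voting counterpart and compare both ensembles with cross-validation; the breast-cancer data is only illustrative, and xgboost plus the sklearn modules used in vote_est are assumed to be importable.

from sklearn import datasets, model_selection

#Soft vote or weighted probabilities
vote_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')

X_demo, y_demo = datasets.load_breast_cancer(return_X_y=True)
print('hard voting CV accuracy: %.3f' % model_selection.cross_val_score(vote_hard, X_demo, y_demo, cv=3).mean())
print('soft voting CV accuracy: %.3f' % model_selection.cross_val_score(vote_soft, X_demo, y_demo, cv=3).mean())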
Example #3
File: cm.py Project: hanyang7427/study
import numpy as np
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
import sklearn.metrics as sm
import matplotlib.pyplot as mp
import mpl_toolkits.axes_grid1 as mg
x, y = [], []
with open('multiple.txt', 'r') as f:
	for line in f.readlines():
		data = [float(substr) for substr in line.split(',')]
		x.append(data[:-1])
		y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
train_x, test_x, train_y, test_y = ms.train_test_split(
	x, y, test_size=0.25, random_state=5)
model = nb.GaussianNB()
pc = ms.cross_val_score(model, x, y, cv=10,
	scoring='precision_weighted')
print(round(pc.mean(), 2))
rc = ms.cross_val_score(model, x, y, cv=10,
	scoring='recall_weighted')
print(round(rc.mean(), 2))
f1 = ms.cross_val_score(model, x, y, cv=10,
	scoring='f1_weighted')
print(round(f1.mean(), 2))
ac = ms.cross_val_score(model, x, y, cv=10,
	scoring='accuracy')
print(round(ac.mean(), 2))
model.fit(train_x, train_y)
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
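A hedged continuation sketch (the original snippet is truncated here): classify the grid with the fitted model and draw the decision regions using the mp alias imported above; the plotting choices are illustrative.

flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=60)
mp.show()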
Example #4
import os
import pandas as pd
from sklearn import naive_bayes, model_selection

path = 'C:\\Users\\Algorithmica\\Downloads'
titanic_train = pd.read_csv(os.path.join(path, 'titanic_train.csv'))
print(titanic_train.shape)
print(titanic_train.info())

features = ['Sex', 'Pclass', 'Embarked', 'Parch', 'SibSp']
titanic_train1 = pd.get_dummies(titanic_train,
                                columns=['Sex', 'Pclass', 'Embarked'])
X_train = titanic_train1.drop(
    ['PassengerId', 'Survived', 'Name', 'Age', 'Cabin', 'Ticket'], axis=1)
y_train = titanic_train['Survived']

classifier = naive_bayes.GaussianNB()
classifier.fit(X_train, y_train)
print(classifier.class_prior_)
print(classifier.sigma_)
print(classifier.theta_)

res = model_selection.cross_validate(classifier, X_train, y_train, cv=10,
                                     return_train_score=True)
print(res.get('test_score').mean())
print(res.get('train_score').mean())

titanic_test = pd.read_csv(os.path.join(path, 'titanic_test.csv'))
print(titanic_test.shape)
print(titanic_test.info())
titanic_test.loc[titanic_test['Fare'].isnull() == True,
                 'Fare'] = titanic_test['Fare'].mean()
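A hedged sketch of how this snippet would plausibly continue (not verbatim from the original project): one-hot encode the test frame the same way, align its columns with X_train, and predict Survived.

titanic_test1 = pd.get_dummies(titanic_test, columns=['Sex', 'Pclass', 'Embarked'])
X_test = titanic_test1.drop(['PassengerId', 'Name', 'Age', 'Cabin', 'Ticket'], axis=1)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
titanic_test['Survived'] = classifier.predict(X_test)
titanic_test.to_csv(os.path.join(path, 'submission.csv'),
                    columns=['PassengerId', 'Survived'], index=False)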
Example #5
def test_GaussianNB(N=10):
    np.random.seed(12345)
    N = np.inf if N is None else N

    # separate trial counter so the inner `for i in range(...)` loop below does not clobber it
    trial = 1
    while trial < N + 1:
        n_ex = np.random.randint(1, 300)
        n_feats = np.random.randint(1, 100)
        n_classes = np.random.randint(2, 10)

        X = random_tensor((n_ex, n_feats), standardize=True)
        y = np.random.randint(0, n_classes, size=n_ex)

        X_test = random_tensor((n_ex, n_feats), standardize=True)

        NB = GaussianNBClassifier(eps=1e-09)
        NB.fit(X, y)

        preds = NB.predict(X_test)

        sklearn_NB = naive_bayes.GaussianNB()
        sklearn_NB.fit(X, y)

        sk_preds = sklearn_NB.predict(X_test)

        for i in range(len(NB.labels)):
            P = NB.parameters
            jointi = np.log(sklearn_NB.class_prior_[i])
            jointi_mine = np.log(P["prior"][i])

            np.testing.assert_almost_equal(jointi, jointi_mine)

            n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[i, :]))
            n_ij_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][i]))

            np.testing.assert_almost_equal(n_ij_mine, n_ij)

            n_ij2 = n_ij - 0.5 * np.sum(
                ((X_test - sklearn_NB.theta_[i, :])**2) /
                (sklearn_NB.sigma_[i, :]), 1)

            n_ij2_mine = n_ij_mine - 0.5 * np.sum(
                ((X_test - P["mean"][i])**2) / (P["sigma"][i]), 1)
            np.testing.assert_almost_equal(n_ij2_mine, n_ij2, decimal=4)

            llh = jointi + n_ij2
            llh_mine = jointi_mine + n_ij2_mine

            np.testing.assert_almost_equal(llh_mine, llh, decimal=4)

        np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
        np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
        np.testing.assert_almost_equal(P["sigma"], sklearn_NB.sigma_)
        np.testing.assert_almost_equal(
            sklearn_NB._joint_log_likelihood(X_test),
            NB._log_posterior(X_test),
            decimal=4,
        )
        np.testing.assert_almost_equal(preds, sk_preds)
        print("PASSED")
        trial += 1
Example #6
    return X


if __name__ == '__main__':

    # Load dataset
    X, y = get_dataset()
    # Split the dataset into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    # Train classifier
    #clf_1 = svm.SVC()
    clf_1 = naive_bayes.GaussianNB()
    clf_1.fit(X_train, y_train)
    # Get predictions
    y_pred = clf_1.predict(X_test)
    # Get accuracy
    print("Accuracy of the NB classifier without etymological information: ")
    print(accuracy_score(y_test, y_pred))

    X = get_etydataset(X)
    # Split the dataset into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    # Train classifier
    #clf_2 = svm.SVC()
clf.fit(x_treino, y_treino)
pred = clf.predict(x_teste)
print('\n\nK-NN classifier.....................................')
print('\nPredictions:')
print(pred)
print('\nActual:')
print(y_teste)
print('\nConfusion matrix:')
print(metrics.confusion_matrix(y_teste, pred))
print('\nClassification report:')
print(metrics.classification_report(y_teste, pred))
print('Accuracy')
print(metrics.accuracy_score(y_teste, pred))

#________________________________________________________________
clf = naive_bayes.GaussianNB()
clf.fit(x_treino, y_treino)
pred = clf.predict(x_teste)
print('\n\nNaive Bayes classifier..............................')
print('\nPredictions:')
print(pred)
print('\nActual:')
print(y_teste)
print('\nConfusion matrix:')
print(metrics.confusion_matrix(y_teste, pred))
print('\nClassification report:')
print(metrics.classification_report(y_teste, pred))
print('Accuracy')
print(metrics.accuracy_score(y_teste, pred))

#________________________________________________________________
Example #8
 def createPipeline(self):
     self.pipeline = Pipeline([('scaler', StandardScaler()),
                               ('model', naive_bayes.GaussianNB())])
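A hedged usage sketch of the same scaler+GaussianNB pipeline outside the class (imports and data here are illustrative, not from the original project):

from sklearn import datasets, naive_bayes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = datasets.load_iris(return_X_y=True)
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('model', naive_bayes.GaussianNB())])
pipeline.fit(X_demo, y_demo)
print(pipeline.score(X_demo, y_demo))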
Example #9
# offset the tick
plt.gca().set_xticks(tick_marks, minor=True)
plt.gca().set_yticks(tick_marks, minor=True)
plt.gca().xaxis.set_ticks_position('none')
plt.gca().yaxis.set_ticks_position('none')
plt.grid(True, which='minor', linestyle='-')
plt.gcf().subplots_adjust(bottom=0.15)

plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
# show confusion matrix
plt.savefig('/content/lg_confusion_matrix.png', format='png')

"""## Bayes"""

import sklearn.naive_bayes as nb
bayes = nb.GaussianNB()
bayes.fit(X_train, y_train)
pred=bayes.predict(X_test)
print('Test accuracy',accuracy_score(y_test, pred))
# dump(lr,"bayes.joblib")

cm = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print (cm_normalized)
plt.figure(figsize=(12, 8), dpi=120)

ind_array = np.arange(len(labels))
x, y = np.meshgrid(ind_array, ind_array)

for x_val, y_val in zip(x.flatten(), y.flatten()):
    TSFV = []
    for dimCounter in range(dimTS):
        TSFV.append(sheetFV.cell_value(TSCounter, dimCounter))
    listTSFV.append(TSFV)

#creating a list of Training Sample Labels (assuming they are in the first column of the worksheet)
listTSLabels = []
for TSCounter in range(noTS):
    listTSLabels.append(sheetLabels.cell_value(TSCounter, 0))

#creating classifiers with the models: 1. Decision Tree 2. SVC 3. KNN 4. Naive Bayes Gaussian and storing them in a list 'listClfs'
listClfs = []
clfDT = tree.DecisionTreeClassifier()
clfSVC = svm.LinearSVC()
clfKN = neighbors.KNeighborsClassifier()
clfGNB = naive_bayes.GaussianNB()
listClfs.extend([clfDT, clfSVC, clfKN, clfGNB])

#training the classifiers
for clf in listClfs:
    clf.fit(listTSFV, listTSLabels)

#testing the classifier on 'testInput' [LIST of test Sample FVs(themselves a list): so a list of lists] and storing the predicted labels in listPredLabels (again a list of lists)
testInput = [[190, 70, 43], [160, 53, 38]]
listPredLabels = []
for clf in listClfs:
    listPredLabels.append(clf.predict(testInput))

#using sklearn.metrics.accuracy_score to evaluate classifier performance; normalize=True (the default) returns the fraction of correct predictions, normalize=False returns the absolute number of correct classifications
trueLabelsTestInput = ['male', 'female']
for predLabels in listPredLabels:
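    # Hedged completion of the truncated loop above (assumes accuracy_score was
    # imported from sklearn.metrics earlier in the original file):
    print(accuracy_score(trueLabelsTestInput, predLabels, normalize=True))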
Example #11
    plt.scatter(qualified_candidates["technical_grade"],
                qualified_candidates["english_grade"],
                color="w")
    plt.scatter(unqualified_candidates["technical_grade"],
                unqualified_candidates["english_grade"],
                color="k")


qualifies_double_grade_df = pd.read_csv("data/double_grade_reevaluated.csv")

X = qualifies_double_grade_df[["technical_grade", "english_grade"]]
y = qualifies_double_grade_df["qualifies"]

sns.pairplot(qualifies_double_grade_df, hue="qualifies")

k_folds = ms.StratifiedKFold(n_splits=4, shuffle=True)

naive_bayes_model = sk_naive_bayes.GaussianNB()
cv_predictions = ms.cross_val_predict(naive_bayes_model, X, y, cv=k_folds)

confusion_matrix = metrics.confusion_matrix(y, cv_predictions)
print(confusion_matrix)

naive_bayes_model.fit(X, y)

plt.figure()
plot_model(naive_bayes_model, qualifies_double_grade_df)

plt.show()
Example #12
 def __init__(self):
     super().__init__(name="GaussianNB")
     self.clf = naive_bayes.GaussianNB()
Example #13
File: bayes.py Project: Podlewski/ADZ
 def __init__(self, data, labels, training_set_ratio):
     super().__init__(data, labels, training_set_ratio)
     self.name = 'Naive Bayes classifier'
     self.short_name = 'Bayes'
     self.params_string = ''
     self.model = naive_bayes.GaussianNB()
Example #14
File: roc.py Project: kob22/pracamgr
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from data import importdata
import numpy as np
from sklearn import naive_bayes
from sklearn import tree

dataset = ['load_german', 'load_haberman', 'load_transfusion', 'load_ionosphere', 'load_balance_scale', 'load_bupa',
           'load_car', 'load_cmc', 'load_ecoli',
           'load_glass', 'load_new_thyroid', 'load_seeds', 'load_solar_flare', 'load_vehicle', 'load_vertebal',
           'load_yeastME1', 'load_yeastME2', 'load_yeastME3',
           'load_abalone0_4', 'load_abalone16_29', 'load_abalone0_4_16_29']
db = getattr(importdata, dataset[20])()
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', naive_bayes.GaussianNB())])
pipe_lr2 = Pipeline([('scl', StandardScaler()),
                     ('clf', LogisticRegression(penalty='l2',
                                                random_state=0,
                                                C=1.0))])

X_train, X_test, y_train, y_test = train_test_split(db.data, db.target, test_size=0.3, stratify=db.target,
                                                    random_state=5)

fig = plt.figure(figsize=(7, 5), facecolor='white')

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

probas = pipe_lr.fit(X_train,
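# Hedged continuation sketch (the original line is cut off); a common ROC pattern
# for this pipeline, assuming roc_curve and auc are imported from sklearn.metrics:
#   probas = pipe_lr.fit(X_train, y_train).predict_proba(X_test)
#   fpr, tpr, thresholds = roc_curve(y_test, probas[:, 1], pos_label=1)
#   plt.plot(fpr, tpr, label='GaussianNB (AUC = %0.2f)' % auc(fpr, tpr))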
Example #15
ohe.fit(titanic[ohe_features])
print(ohe.n_values_)
tmp1 = ohe.transform(titanic[ohe_features]).toarray()

features = ['Age', 'Fare', 'Parch' , 'SibSp', 'FamilySize']
tmp2 = titanic[features].values

tmp = np.concatenate((tmp1,tmp2), axis=1)

X_train = tmp[:titanic_train.shape[0]]
y_train = titanic_train['Survived']

sns.distplot(X_train[:,33], hist=False)


nb_estimator = naive_bayes.GaussianNB()
nb_estimator.fit(X_train, y_train)

print(nb_estimator.class_prior_)
print(nb_estimator.sigma_)
print(nb_estimator.theta_)

res = model_selection.cross_validate(nb_estimator, X_train, y_train, cv=10)
print(res.get('test_score').mean())
print(nb_estimator.score(X_train, y_train))

X_test = tmp[titanic_train.shape[0]:]

titanic_test['Survived'] = nb_estimator.predict(X_test)
titanic_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv", columns=["PassengerId", "Survived"], index=False)
Example #16
from sklearn import tree, ensemble, svm, naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
from scipy import stats

if __name__ == '__main__':
    features = np.loadtxt("data/w2v/w2v_embeddings.txt")
    classes = np.loadtxt("data/w2v/w2v_labels.txt").astype(int)
    nbc = naive_bayes.GaussianNB()
    nbc_scores = cross_val_score(nbc, features, classes, cv=5)
    print('Naive bayes mean accuracy : %.2f' % (nbc_scores.mean()))

    rfc = ensemble.RandomForestClassifier()
    rfc_scores = cross_val_score(rfc, features, classes, cv=5)
    print('Random forest mean accuracy : %.2f' % (rfc_scores.mean()))

    lrc = LogisticRegression()
    lrc_scores = cross_val_score(lrc, features, classes, cv=5)
    print('LR mean accuracy : %.2f' % (lrc_scores.mean()))
def spot_check(task, seed, splits, df, y):
    results = {}
    seed = seed
    kfold = KFold(n_splits=splits, shuffle=True, random_state=seed)

    if task in ('c', 'C'):
        print("Spot Checking Classification Algorithms: ")
        #Classification Models!
        #LINEAR Classification models:
        #Logistic Regression
        model = linear_model.LogisticRegression()
        result = cross_val_score(model, df, y, cv=kfold)
        results['LoR'] = result.mean()
        print("Linear-\nLogistic Regression: ", results['LoR'])
        #LDA
        model = discriminant_analysis.LinearDiscriminantAnalysis()
        result = cross_val_score(model, df, y, cv=kfold)
        results['LDA'] = result.mean()
        print("LDA score: ", results['LDA'], end="\n\n")
        #NON-LINEAR Classification models:
        #KNN
        model = neighbors.KNeighborsClassifier(
        )  #Careful of the spelling of Neighbors
        result = cross_val_score(model, df, y, cv=kfold)
        results['KNNC'] = result.mean()
        print("Non-Linear-\nKNN: ", results['KNNC'])
        #Naive Bayes
        model = naive_bayes.GaussianNB()
        result = cross_val_score(model, df, y, cv=kfold)
        results['NBayes'] = result.mean()
        print("Naive Bayes: ", results['NBayes'])
        #Classification and Regression Trees / decision trees
        model = tree.DecisionTreeClassifier()
        result = cross_val_score(model, df, y, cv=kfold)
        results['CARTC'] = result.mean()
        print("CART: ", results['CARTC'])
        #Support Vector Machines
        model = svm.SVC()
        result = cross_val_score(model, df, y, cv=kfold)
        results['SVC'] = result.mean()
        print("Support Vector Machine: ", results['SVC'])
    elif task in ('r', 'R'):
        print("Spot Checking Regression Algorithms: ")
        scoring = 'neg_mean_squared_error'
        #Regression Models
        #LINEAR Regression Models
        #Linear Regression
        model = linear_model.LinearRegression()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['LiR'] = result.mean()
        print("Linear-\nLinear Regression: ", results['LiR'])
        #Ridge Regression (L2 norm)
        model = linear_model.Ridge()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['RR'] = result.mean()
        print("Ridge Regression: ", results['RR'])
        #Least Absolute Shrinkage and Selection Operator (L1 norm)
        model = linear_model.Lasso()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['LASSO'] = result.mean()
        print("LASSO: ", results['LASSO'])
        #ElasticNet Regression (L1 and L2 norm)
        model = linear_model.ElasticNet()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['ENet'] = result.mean()
        print("ElasticNet Regression: ", results['ENet'])
        #NON LINEAR Regression models
        #K-Nearest Neighbours
        model = neighbors.KNeighborsRegressor()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['KNNR'] = result.mean()
        print("Non-Linear-\nKNN: ", results['KNNR'])
        #Classification and Regression Trees
        model = tree.DecisionTreeRegressor()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['CARTR'] = result.mean()
        print("CART: ", results['CARTR'])
        #Support Vector Machine
        model = svm.SVR()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['SVR'] = result.mean()
        print("Support Vector Machine: ", results['SVR'])
    else:
        print("Invalid task definition (r/c)!")

    #Select top three in spot checked algorithms
    res = []
    for i in range(3):
        res.append(max(results.items(), key=op.itemgetter(1))[0])
        del results[res[i]]
    return res
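A hedged usage sketch of spot_check (the imports below mirror what the function body needs; the iris data is only illustrative):

import operator as op
from sklearn import datasets, discriminant_analysis, linear_model, naive_bayes, neighbors, svm, tree
from sklearn.model_selection import KFold, cross_val_score

X_demo, y_demo = datasets.load_iris(return_X_y=True)
print(spot_check('c', seed=7, splits=5, df=X_demo, y=y_demo))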
Example #18
'''
for dataset in kk.getDatasets():
    cnt = cnt+1
    print ('Dataset Name:',dataset.DESCR[0:10])
    X = dataset.data
    Y = dataset.target
    X = kk.MeanNormalizer(X)
    n_comp = X.shape[1] - cnt
    if n_comp <= 0:
        n_comp = X.shape[1]
    pca.set_params(n_components = n_comp)
    X = pca.fit_transform(X)
    print('Score:',kk.fitModel(bayes, X, Y))
    del X,Y
'''
gauss = skn.GaussianNB()
bernoulli = skn.BernoulliNB()
multi = skn.MultinomialNB()
for dataset in kk.getDatasets(binary=True):
    #dataset = skd.load_breast_cancer()
    print ('Dataset Name:',dataset.DESCR[0:20],'\n=================================')
    X = dataset.data + np.random.random(size=(dataset.data.shape))
    Y = dataset.target
    if(len(np.unique(Y)) > 2):
        bernoulli.set_params(binarize=True)
    else:
        print('Binary Classification')
    X = kk.MeanNormalizer(X)
    pca.set_params(n_components=np.random.randint(1, X.shape[1]+1))
    X = pca.fit_transform(X)
    kk.fitModel(bayes,X,Y)
testY = numpy.array(multinomialNB.predict(x))
trainY = numpy.array(y)
ErrorY = numpy.array(numpy.subtract(testY, trainY))

error = 0
for k in range(0, ErrorY.size, 1):
    if ErrorY[k] != 0:
        error = error + 1
sizeOfErrorY = ErrorY.size
error = float(float(error) / float(sizeOfErrorY))
error = error * float(100)
print "Multinomial NB Percentage of error on training set is " + str(error)
#Multinomial Naive Bayes Classification

guassianNB = naive_bayes.GaussianNB()
guassianNB.fit(x, y)

testY = numpy.array(guassianNB.predict(x))
trainY = numpy.array(y)
ErrorY = numpy.array(numpy.subtract(testY, trainY))

error = 0
for k in range(0, ErrorY.size, 1):
    if ErrorY[k] != 0:
        error = error + 1
sizeOfErrorY = ErrorY.size
error = float(float(error) / float(sizeOfErrorY))
error = error * float(100)
print "Guassian NB Percentage of error on training set is " + str(error)
#Guassian Naive Bayes Classification
Example #20
]]
clfs = {}

#clfs['lr'] = {'clf': linear_model.LogisticRegression(), 'name':'LogisticRegression'}
#clfs['rf'] = {'clf': ensemble.RandomForestClassifier(n_estimators=750, n_jobs=-1), 'name':'RandomForest'}
clfs['tr'] = {'clf': tree.DecisionTreeClassifier(), 'name': 'DecisionTree'}
#clfs['knn'] = {'clf': neighbors.KNeighborsClassifier(n_neighbors=4), 'name':'kNearestNeighbors'}
#clfs['svc'] = {'clf': svm.SVC(kernel='linear'), 'name': 'SupportVectorClassifier'}
clfs['nusvc'] = {'clf': svm.NuSVC(), 'name': 'NuSVC'}
clfs['linearsvc'] = {'clf': svm.LinearSVC(), 'name': 'LinearSVC'}
clfs['SGD'] = {'clf': linear_model.SGDClassifier(), 'name': 'SGDClassifier'}
clfs['GPC'] = {
    'clf': gaussian_process.GaussianProcessClassifier(),
    'name': 'GaussianProcess'
}
clfs['nb'] = {'clf': naive_bayes.GaussianNB(), 'name': 'GaussianNaiveBayes'}
clfs['bag'] = {
    'clf':
    ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(),
                               max_samples=0.5,
                               max_features=0.5),
    'name':
    "BaggingClassifier"
}
clfs['gbc'] = {
    'clf': ensemble.GradientBoostingClassifier(),
    'name': 'GradientBoostingClassifier'
}
#clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(100,100,100), alpha=1e-5, solver='lbfgs', max_iter=500), 'name': 'MultilayerPerceptron'}
parameters = {
    'solver': ['lbfgs'],
Example #21
def naive_bayes_model(x_train, y_train, x_test, y_test):
    model = naive_bayes.GaussianNB()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    cal_scores(y_pred, y_test, 'GaussianNB')
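A hedged usage sketch (cal_scores is the original project's helper and is assumed to be defined; the split below is illustrative):

from sklearn import datasets, model_selection, naive_bayes

X_demo, y_demo = datasets.load_iris(return_X_y=True)
x_tr, x_te, y_tr, y_te = model_selection.train_test_split(
    X_demo, y_demo, test_size=0.25, random_state=0)
naive_bayes_model(x_tr, y_tr, x_te, y_te)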
Example #22
def initialize_models(model_type, parameters, n_cores = -1):
    """
    Function initializes list of models

    Inputs:
    model_type -- model type
    parameters -- hyperparameters dictionary
    n_cores -- number of cores to us, default 1
                               
    Results:
    Initialized model
    """
    
    if model_type == "RandomForest":
        return ensemble.RandomForestClassifier(
            n_estimators=parameters['n_estimators'],
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)

    elif model_type == 'ScaledLogisticRegression':
        return ScaledLogisticRegression(
                C=parameters['C_reg'],
                penalty=parameters['penalty'],
                random_state=parameters['random_state'],
                n_jobs = n_cores)

    elif model_type == "RandomForestBagging":
        #TODO Make Model Bagging
        return ensemble.BaggingClassifier(
                    ensemble.RandomForestClassifier(
                        n_estimators=parameters['n_estimators'],
                        max_features=parameters['max_features'],
                        criterion=parameters['criterion'],
                        max_depth=parameters['max_depth'],
                        min_samples_split=parameters['min_samples_split'],
                        random_state=parameters['random_state'],
                        n_jobs=n_cores),
                    #Bagging parameters
                    n_estimators=parameters['n_estimators_bag'],
                    max_samples=parameters['max_samples'],
                    max_features=parameters['max_features_bag'],
                    bootstrap=parameters['bootstrap'],
                    bootstrap_features=parameters['bootstrap_features'],
                    n_jobs=n_cores)

    elif model_type == "RandomForestBoosting":
        #TODO Make Model Boosting
        return ensemble.AdaBoostClassifier(
            ensemble.RandomForestClassifier(
                n_estimators=parameters['n_estimators'],
                max_features=parameters['max_features'],
                criterion=parameters['criterion'],
                max_depth=parameters['max_depth'],
                min_samples_split=parameters['min_samples_split'],
                random_state=parameters['random_state'],
                n_jobs=n_cores),
            #Boosting parameters
            learning_rate=parameters['learning_rate'],
            algorithm=parameters['algorithm'],
            n_estimators=parameters['n_estimators_boost'])

    elif model_type == 'SVM':
        return svm.SVC(C=parameters['C_reg'],
                       kernel=parameters['kernel'],
                       probability=True,
                       random_state=parameters['random_state'])

    elif model_type == 'LogisticRegression':
        return linear_model.LogisticRegression(
            C=parameters['C_reg'],
            penalty=parameters['penalty'],
            random_state=parameters['random_state'])

    elif model_type == 'AdaBoost':
        return ensemble.AdaBoostClassifier(
            learning_rate=parameters['learning_rate'],
            algorithm=parameters['algorithm'],
            n_estimators=parameters['n_estimators'],
            random_state=parameters['random_state'])

    elif model_type == 'ExtraTrees':
        return ensemble.ExtraTreesClassifier(
            n_estimators=parameters['n_estimators'],
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)

    elif model_type == 'GradientBoostingClassifier':
        return ensemble.GradientBoostingClassifier(
            n_estimators=parameters['n_estimators'],
            learning_rate=parameters['learning_rate'],
            subsample=parameters['subsample'],
            max_depth=parameters['max_depth'],
            random_state=parameters['random_state'])

    elif model_type == 'GaussianNB':
        return naive_bayes.GaussianNB()

    elif model_type == 'DecisionTreeClassifier':
        return tree.DecisionTreeClassifier(
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            random_state=parameters['random_state'])

    elif model_type == 'SGDClassifier':
        return linear_model.SGDClassifier(
            loss=parameters['loss'],
            penalty=parameters['penalty'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)

    elif model_type == 'KNeighborsClassifier':
        return neighbors.KNeighborsClassifier(
            n_neighbors=parameters['n_neighbors'],
            weights=parameters['weights'],
            algorithm=parameters['algorithm'],
            n_jobs=n_cores)

    else:
        raise ConfigError("Sorry, unsupported model {}".format(model_type))
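A hedged usage sketch (the parameter values are illustrative; the original module's imports of ensemble, naive_bayes, etc., plus its ConfigError and ScaledLogisticRegression definitions, are assumed):

nb_clf = initialize_models('GaussianNB', parameters={})
rf_clf = initialize_models('RandomForest', parameters={
    'n_estimators': 100, 'max_features': 'sqrt', 'criterion': 'gini',
    'max_depth': 5, 'min_samples_split': 2, 'random_state': 0}, n_cores=2)
print(nb_clf, rf_clf)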
Example #24
File: abalone.py Project: tqtifnypmb/ML
def gaussian_naive_bayes(x_train, y_train, x_test, y_test):
    model = naive_bayes.GaussianNB()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    cal_metrics(y_test, y_pred, 'gaussianNB')
def train_NB(x_train, y_train):
    model = naive_bayes.GaussianNB()
    model.fit(X=x_train, y=y_train)

    return model
 def __init__(self):
     from sklearn import naive_bayes
     self.algorithm = naive_bayes.GaussianNB()
Example #27
            col[h] = ['orange', 'orange', 'green', 'green', 'green']
            labels[h] = [6, 6, 2, 2, 2]
        elif i == uncertain or i == utest:
            col[h] = ['red', 'red', 'green', 'green', 'green']
            labels[h] = [7, 7, 2, 2, 2]
        h = h + 1

#sample = testdata[0:500]
#pdsample = plotdata[0:500]
print(np.shape(sampletest))

#fig = pl.figure(1)
#pl.clf()

print "Gaussian Naive Bayes"
gnb = naive_bayes.GaussianNB()
for i in range(0, 5):
    if i == 1 or i == 3:
        X_train = training2
        y_train = col[100:172, i]
        X_test = sample2
        y_test = col[272:, i]
    else:
        X_train = training
        y_train = col[:172, i]
        X_test = sampletest
        y_test = col[172:, i]

    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
Example #28
print(iris.target.shape)

# in order to plot data, we select first two features
iris.data = iris.data[:, :2]
# for binary classification, treat label 2 as the positive class and all others as negative
iris.target[iris.target != 2] = -1
iris.target[iris.target == 2] = 1
iris.target[iris.target == -1] = 0

# # Split the data into training/testing sets
d_train, d_test, t_train, t_test = model_selection.train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)

# make model
prior = [0.9, 0.1]  # P(Y=0) = 0.9, P(Y=1) = 0.1
model = naive_bayes.GaussianNB(
    priors=prior)  # if priors=None, model learns prior from the data

# training model
model.fit(d_train, t_train)

# calculation for plotting grid
h = 0.01
x_min, x_max = d_train[:, 0].min() - .5, d_train[:, 0].max() + .5
y_min, y_max = d_train[:, 1].min() - .5, d_train[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

# plotting
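# Hedged continuation sketch (the original snippet ends here); a typical way to
# finish the plot, assuming matplotlib.pyplot was imported as plt earlier:
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
plt.scatter(d_train[:, 0], d_train[:, 1], c=t_train, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.show()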
Example #29
# Constants
long_names = {
    "lr":"Logistic regression",
    "lsvc":"SVC with linear kernel",
    "rbfsvc":"SVC with RBF kernel",
    "nb":"Naive Bayes",
    "dt":"Decision tree",
    "rf":"Random forest",
    "et":"Extremely randomized trees"
}

classifier_dict = {
    "lr":linear_model.LogisticRegression(),
    "lsvc":svm.SVC( kernel = "linear", probability = True ),
    "rbfsvc":svm.SVC( probability = True ),
    "nb":naive_bayes.GaussianNB(),
    "dt":tree.DecisionTreeClassifier(),
    "rf":ensemble.RandomForestClassifier( n_estimators=100 ),
    "et":ensemble.ExtraTreesClassifier( n_estimators=100 )
}

normalization_dict = {
    "lr":"scaled",
    "lsvc":"scaled",
    "rbfsvc":"scaled",
    "nb":"log"
}

feature_selection_C_methods = [
    lambda C: svm.LinearSVC( C = C, penalty = "l1", dual = False),
    lambda C: linear_model.LogisticRegression( C = C, penalty = "l1" ),
Example #30
                                                    random_state=4)

# decision tree with gini impurity criterion
dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
dtree_gini_mod.fit(x_train, y_train)
preds_gini = dtree_gini_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds_gini)

# decision tree with entropy criterion
dtree_entropy_mod = tree.DecisionTreeClassifier(criterion='entropy')
dtree_entropy_mod.fit(x_train, y_train)
preds_entropy = dtree_entropy_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds_entropy)

# Naive Bayes classifier
gnb_mod = naive_bayes.GaussianNB()
gnb_mod.fit(x_train, y_train)
preds = gnb_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds)

# Best Model
data = pd.read_csv(r'C:\GitHub\DSCI401\data\churn_data.csv')

del data['FamilySize']
del data['CustID']

# label encoding for categorical variables
for i in data.columns:
    if data[i].dtype == 'object':
        le = preprocessing.LabelEncoder()
        le.fit(list(data[i].values))