# individual
#
# def individual(args):
#     name_cls = args[0]
#     wX = args[1]
#     wy = args[2]
#     return name_cls.fit(wX, wy)


def individual(name_cls, wX, wy):
    return name_cls.fit(wX, wy)


NAME_INDIVIDUALS = {
    'DT':   tree.DecisionTreeClassifier(),
    'NB':   naive_bayes.GaussianNB(),
    'SVM':  svm.SVC(gamma='scale'),
    'LSVM': svm.LinearSVC(),
    'KNNu': neighbors.KNeighborsClassifier(weights='uniform'),
    'KNNd': neighbors.KNeighborsClassifier(weights='distance'),
    'LM1':  linear_model.SGDClassifier(penalty='l1'),
    'LM2':  linear_model.SGDClassifier(penalty='l2'),
}

#----------------------------------------
# EnsembleVoting
#----------------------------------------
# Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
vote_est = [  # opening of the list reconstructed from the VotingClassifier call below
    ('ada', ensemble.AdaBoostClassifier()),
    ('bc',  ensemble.BaggingClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),

    # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    ('gpc', gaussian_process.GaussianProcessClassifier()),

    # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    ('lr', linear_model.LogisticRegressionCV()),

    # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB()),

    # Nearest Neighbors: http://scikit-learn.org/stable/modules/neighbors.html
    ('knn', neighbors.KNeighborsClassifier()),

    # SVM: http://scikit-learn.org/stable/modules/svm.html
    ('svc', svm.SVC(probability=True)),

    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    ('xgb', XGBClassifier())
]

# Hard vote, i.e. majority-rules voting
vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
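# For comparison, a soft-vote counterpart can be built from the same estimator
# list (a minimal sketch, not in the original): soft voting averages each
# estimator's predicted class probabilities, which is why the SVC above is
# constructed with probability=True.
vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')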
import numpy as np                    # added: used throughout below
import sklearn.model_selection as ms  # added: ms.train_test_split / ms.cross_val_score
import sklearn.naive_bayes as nb      # added: nb.GaussianNB
import sklearn.metrics as sm
import matplotlib.pyplot as mp
import mpl_toolkits.axes_grid1 as mg

x, y = [], []
with open('multiple.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)

# grid bounds and step sizes for the decision-boundary plot
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005

train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=5)

model = nb.GaussianNB()
pc = ms.cross_val_score(model, x, y, cv=10, scoring='precision_weighted')
print(round(pc.mean(), 2))
rc = ms.cross_val_score(model, x, y, cv=10, scoring='recall_weighted')
print(round(rc.mean(), 2))
f1 = ms.cross_val_score(model, x, y, cv=10, scoring='f1_weighted')
print(round(f1.mean(), 2))
ac = ms.cross_val_score(model, x, y, cv=10, scoring='accuracy')
print(round(ac.mean(), 2))

model.fit(train_x, train_y)
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
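# A minimal continuation sketch (assumed, not in the original): classify every
# grid point and draw the resulting decision regions with the test samples.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
mp.figure('Naive Bayes Classification')
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=60)
mp.show()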
import os
import pandas as pd   # added: pd.read_csv / pd.get_dummies below
from sklearn import naive_bayes, model_selection

path = 'C:\\Users\\Algorithmica\\Downloads'
titanic_train = pd.read_csv(os.path.join(path, 'titanic_train.csv'))
print(titanic_train.shape)
titanic_train.info()   # .info() prints itself; wrapping it in print() emits a stray None

features = ['Sex', 'Pclass', 'Embarked', 'Parch', 'SibSp']
titanic_train1 = pd.get_dummies(titanic_train,
                                columns=['Sex', 'Pclass', 'Embarked'])
X_train = titanic_train1.drop(
    ['PassengerId', 'Survived', 'Name', 'Age', 'Cabin', 'Ticket'], axis=1)
y_train = titanic_train['Survived']

classifier = naive_bayes.GaussianNB()
classifier.fit(X_train, y_train)
print(classifier.class_prior_)   # learned class priors
print(classifier.sigma_)         # per-class feature variances
print(classifier.theta_)         # per-class feature means

# return_train_score=True is needed for the train_score access below
res = model_selection.cross_validate(classifier, X_train, y_train, cv=10,
                                     return_train_score=True)
print(res.get('test_score').mean())
print(res.get('train_score').mean())

titanic_test = pd.read_csv(os.path.join(path, 'titanic_test.csv'))
print(titanic_test.shape)
titanic_test.info()
titanic_test.loc[titanic_test['Fare'].isnull(), 'Fare'] = \
    titanic_test['Fare'].mean()
def test_GaussianNB(N=10):
    np.random.seed(12345)
    N = np.inf if N is None else N

    i = 1
    while i < N + 1:
        n_ex = np.random.randint(1, 300)
        n_feats = np.random.randint(1, 100)
        n_classes = np.random.randint(2, 10)

        X = random_tensor((n_ex, n_feats), standardize=True)
        y = np.random.randint(0, n_classes, size=n_ex)
        X_test = random_tensor((n_ex, n_feats), standardize=True)

        NB = GaussianNBClassifier(eps=1e-09)
        NB.fit(X, y)
        preds = NB.predict(X_test)

        sklearn_NB = naive_bayes.GaussianNB()
        sklearn_NB.fit(X, y)
        sk_preds = sklearn_NB.predict(X_test)

        # the class loop gets its own variable `c`; the original reused `i`,
        # which clobbered the outer trial counter
        for c in range(len(NB.labels)):
            P = NB.parameters
            jointi = np.log(sklearn_NB.class_prior_[c])
            jointi_mine = np.log(P["prior"][c])
            np.testing.assert_almost_equal(jointi, jointi_mine)

            n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[c, :]))
            n_ij_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][c]))
            np.testing.assert_almost_equal(n_ij_mine, n_ij)

            n_ij2 = n_ij - 0.5 * np.sum(
                ((X_test - sklearn_NB.theta_[c, :]) ** 2) / (sklearn_NB.sigma_[c, :]), 1)
            n_ij2_mine = n_ij_mine - 0.5 * np.sum(
                ((X_test - P["mean"][c]) ** 2) / (P["sigma"][c]), 1)
            np.testing.assert_almost_equal(n_ij2_mine, n_ij2, decimal=4)

            llh = jointi + n_ij2
            llh_mine = jointi_mine + n_ij2_mine
            np.testing.assert_almost_equal(llh_mine, llh, decimal=4)

        np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
        np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
        np.testing.assert_almost_equal(P["sigma"], sklearn_NB.sigma_)
        np.testing.assert_almost_equal(
            sklearn_NB._joint_log_likelihood(X_test),
            NB._log_posterior(X_test),
            decimal=4,
        )
        np.testing.assert_almost_equal(preds, sk_preds)
        print("PASSED")
        i += 1
    return X


if __name__ == '__main__':
    # Load the dataset
    X, y = get_dataset()

    # Split the dataset into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)

    # Train the classifier
    #clf_1 = svm.SVC()
    clf_1 = naive_bayes.GaussianNB()
    clf_1.fit(X_train, y_train)

    # Get predictions
    y_pred = clf_1.predict(X_test)

    # Report accuracy
    print("Accuracy of the NB classifier without etymological information: ")
    print(accuracy_score(y_test, y_pred))

    X = get_etydataset(X)

    # Split the dataset into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)

    # Train the classifier
    #clf_2 = svm.SVC()
    # (remainder reconstructed by symmetry with clf_1; the original is cut off here)
    clf_2 = naive_bayes.GaussianNB()
    clf_2.fit(X_train, y_train)
    y_pred = clf_2.predict(X_test)
    print("Accuracy of the NB classifier with etymological information: ")
    print(accuracy_score(y_test, y_pred))
clf.fit(x_treino, y_treino)
pred = clf.predict(x_teste)

print('\n\nK-NN classifier.....................................')
print('\nPredictions:')
print(pred)
print('\nActual:')
print(y_teste)
print('\nConfusion matrix:')
print(metrics.confusion_matrix(y_teste, pred))
print('\nClassification report:')
print(metrics.classification_report(y_teste, pred))
print('Accuracy:')
print(metrics.accuracy_score(y_teste, pred))
#________________________________________________________________

clf = naive_bayes.GaussianNB()
clf.fit(x_treino, y_treino)
pred = clf.predict(x_teste)

print('\n\nNaive Bayes classifier..............................')
print('\nPredictions:')
print(pred)
print('\nActual:')
print(y_teste)
print('\nConfusion matrix:')
print(metrics.confusion_matrix(y_teste, pred))
print('\nClassification report:')
print(metrics.classification_report(y_teste, pred))
print('Accuracy:')
print(metrics.accuracy_score(y_teste, pred))
#________________________________________________________________
    def createPipeline(self):
        # Scale features before the model; GaussianNB itself is unaffected by
        # per-feature scaling, but this keeps the pipeline interchangeable
        # with scale-sensitive estimators.
        self.pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('model', naive_bayes.GaussianNB())])
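# A hedged usage sketch (X_train / y_train / X_test are placeholder names, not
# from the original). GaussianNB fits a per-feature mean and variance per class,
# so standardization does not change its predictions; the scaler is kept for
# uniformity across models.
#
# obj.createPipeline()
# obj.pipeline.fit(X_train, y_train)    # scaler statistics come from training data only
# y_pred = obj.pipeline.predict(X_test)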
# offset the tick marks so the grid lines fall between cells
plt.gca().set_xticks(tick_marks, minor=True)
plt.gca().set_yticks(tick_marks, minor=True)
plt.gca().xaxis.set_ticks_position('none')
plt.gca().yaxis.set_ticks_position('none')
plt.grid(True, which='minor', linestyle='-')
plt.gcf().subplots_adjust(bottom=0.15)

plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
# save the confusion matrix figure
plt.savefig('/content/lg_confusion_matrix.png', format='png')

"""## Bayes"""

import sklearn.naive_bayes as nb

bayes = nb.GaussianNB()
bayes.fit(X_train, y_train)
pred = bayes.predict(X_test)
print('Test accuracy', accuracy_score(y_test, pred))
# dump(bayes, "bayes.joblib")   # original said dump(lr, ...), which would save the wrong model

cm = confusion_matrix(y_test, pred)
np.set_printoptions(precision=2)
# row-normalize: each row sums to 1, giving per-true-class proportions
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm_normalized)

plt.figure(figsize=(12, 8), dpi=120)
ind_array = np.arange(len(labels))
x, y = np.meshgrid(ind_array, ind_array)
for x_val, y_val in zip(x.flatten(), y.flatten()):
    # body reconstructed from the standard annotated-confusion-matrix recipe
    # this notebook follows; the original is cut off here
    c = cm_normalized[y_val][x_val]
    if c > 0.01:
        plt.text(x_val, y_val, "%0.2f" % (c,), color='red',
                 fontsize=10, va='center', ha='center')
# (the enclosing `for TSCounter in range(noTS):` header is elided above)
    TSFV = []
    for dimCounter in range(dimTS):
        TSFV.append(sheetFV.cell_value(TSCounter, dimCounter))
    listTSFV.append(TSFV)

# creating a list of Training Sample labels (assuming they are in the first
# column of the worksheet)
listTSLabels = []
for TSCounter in range(noTS):
    listTSLabels.append(sheetLabels.cell_value(TSCounter, 0))

# creating classifiers with the models: 1. Decision Tree 2. SVC 3. KNN
# 4. Gaussian Naive Bayes, and storing them in the list 'listClfs'
listClfs = []
clfDT = tree.DecisionTreeClassifier()
clfSVC = svm.LinearSVC()
clfKN = neighbors.KNeighborsClassifier()
clfGNB = naive_bayes.GaussianNB()
listClfs.extend([clfDT, clfSVC, clfKN, clfGNB])

# training the classifiers
for clf in listClfs:
    clf.fit(listTSFV, listTSLabels)

# testing the classifiers on 'testInput' (a LIST of test sample feature
# vectors, themselves lists) and storing the predicted labels in
# listPredLabels (a list of prediction arrays)
testInput = [[190, 70, 43], [160, 53, 38]]
listPredLabels = []
for clf in listClfs:
    listPredLabels.append(clf.predict(testInput))

# using sklearn.metrics.accuracy_score to evaluate classifier performance;
# normalize=True (the default) gives a fractional score, normalize=False the
# absolute number of correct classifications
trueLabelsTestInput = ['male', 'female']
for predLabels in listPredLabels:
    # body reconstructed from the comment above; assumes
    # `from sklearn.metrics import accuracy_score` earlier in the file
    print(accuracy_score(trueLabelsTestInput, predLabels))
plt.scatter(qualified_candidates["technical_grade"],
            qualified_candidates["english_grade"], color="w")
plt.scatter(unqualified_candidates["technical_grade"],
            unqualified_candidates["english_grade"], color="k")

qualifies_double_grade_df = pd.read_csv("data/double_grade_reevaluated.csv")

X = qualifies_double_grade_df[["technical_grade", "english_grade"]]
y = qualifies_double_grade_df["qualifies"]

sns.pairplot(qualifies_double_grade_df, hue="qualifies")

k_folds = ms.StratifiedKFold(n_splits=4, shuffle=True)
naive_bayes_model = sk_naive_bayes.GaussianNB()

# out-of-fold predictions: every sample is predicted by a model that never saw it
cv_predictions = ms.cross_val_predict(naive_bayes_model, X, y, cv=k_folds)
confusion_matrix = metrics.confusion_matrix(y, cv_predictions)
print(confusion_matrix)

naive_bayes_model.fit(X, y)

plt.figure()
plot_model(naive_bayes_model, qualifies_double_grade_df)
plt.show()
    def __init__(self):
        super().__init__(name="GaussianNB")
        self.clf = naive_bayes.GaussianNB()
    def __init__(self, data, labels, training_set_ratio):
        super().__init__(data, labels, training_set_ratio)
        self.name = 'Naive Bayes classifier'  # translated from Polish: 'Naiwny klasyfikator Bayesa'
        self.short_name = 'Bayes'
        self.params_string = ''
        self.model = naive_bayes.GaussianNB()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
import matplotlib.pyplot as plt
from data import importdata
import numpy as np
from sklearn import naive_bayes
from sklearn import tree
from sklearn.pipeline import Pipeline                 # added: used below
from sklearn.preprocessing import StandardScaler      # added: used below
from sklearn.linear_model import LogisticRegression   # added: used below

dataset = ['load_german', 'load_haberman', 'load_transfusion', 'load_ionosphere',
           'load_balance_scale', 'load_bupa', 'load_car', 'load_cmc',
           'load_ecoli', 'load_glass', 'load_new_thyroid', 'load_seeds',
           'load_solar_flare', 'load_vehicle', 'load_vertebal', 'load_yeastME1',
           'load_yeastME2', 'load_yeastME3', 'load_abalone0_4',
           'load_abalone16_29', 'load_abalone0_4_16_29']

db = getattr(importdata, dataset[20])()

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', naive_bayes.GaussianNB())])
pipe_lr2 = Pipeline([('scl', StandardScaler()),
                     ('clf', LogisticRegression(penalty='l2', random_state=0,
                                                C=1.0))])

X_train, X_test, y_train, y_test = train_test_split(db.data, db.target,
                                                    test_size=0.3,
                                                    stratify=db.target,
                                                    random_state=5)

fig = plt.figure(figsize=(7, 5), facecolor='white')
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

# call completed from the standard ROC-curve recipe this script follows;
# the original is cut off mid-call here
probas = pipe_lr.fit(X_train, y_train).predict_proba(X_test)
ohe.fit(titanic[ohe_features])
print(ohe.n_values_)
tmp1 = ohe.transform(titanic[ohe_features]).toarray()

features = ['Age', 'Fare', 'Parch', 'SibSp', 'FamilySize']
tmp2 = titanic[features].values
tmp = np.concatenate((tmp1, tmp2), axis=1)

X_train = tmp[:titanic_train.shape[0]]
y_train = titanic_train['Survived']
sns.distplot(X_train[:, 33], hist=False)

nb_estimator = naive_bayes.GaussianNB()
nb_estimator.fit(X_train, y_train)
print(nb_estimator.class_prior_)   # learned class priors
print(nb_estimator.sigma_)         # per-class feature variances
print(nb_estimator.theta_)         # per-class feature means

res = model_selection.cross_validate(nb_estimator, X_train, y_train, cv=10)
print(res.get('test_score').mean())
print(nb_estimator.score(X_train, y_train))

X_test = tmp[titanic_train.shape[0]:]
titanic_test['Survived'] = nb_estimator.predict(X_test)
titanic_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
                    columns=["PassengerId", "Survived"], index=False)
from sklearn import tree, ensemble, svm, naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
from scipy import stats

if __name__ == '__main__':
    features = np.loadtxt("data/w2v/w2v_embeddings.txt")
    classes = np.loadtxt("data/w2v/w2v_labels.txt").astype(int)

    nbc = naive_bayes.GaussianNB()
    nbc_scores = cross_val_score(nbc, features, classes, cv=5)
    print('Naive Bayes mean accuracy : %.2f' % (nbc_scores.mean()))

    rfc = ensemble.RandomForestClassifier()
    rfc_scores = cross_val_score(rfc, features, classes, cv=5)
    print('Random forest mean accuracy : %.2f' % (rfc_scores.mean()))

    lrc = LogisticRegression()
    lrc_scores = cross_val_score(lrc, features, classes, cv=5)
    print('LR mean accuracy : %.2f' % (lrc_scores.mean()))
def spot_check(task, seed, splits, df, y):
    results = {}
    # shuffle=True is required for random_state to take effect in KFold
    kfold = KFold(n_splits=splits, shuffle=True, random_state=seed)

    if task in ('c', 'C'):
        print("Spot Checking Classification Algorithms: ")
        # Classification models

        # LINEAR classification models:
        # Logistic Regression
        model = linear_model.LogisticRegression()
        result = cross_val_score(model, df, y, cv=kfold)
        results['LoR'] = result.mean()
        print("Linear-\nLogistic Regression: ", results['LoR'])

        # LDA
        model = discriminant_analysis.LinearDiscriminantAnalysis()
        result = cross_val_score(model, df, y, cv=kfold)
        results['LDA'] = result.mean()
        print("LDA score: ", results['LDA'], end="\n\n")

        # NON-LINEAR classification models:
        # KNN
        model = neighbors.KNeighborsClassifier()  # careful of the spelling of Neighbors
        result = cross_val_score(model, df, y, cv=kfold)
        results['KNNC'] = result.mean()
        print("Non-Linear-\nKNN: ", results['KNNC'])

        # Naive Bayes
        model = naive_bayes.GaussianNB()
        result = cross_val_score(model, df, y, cv=kfold)
        results['NBayes'] = result.mean()
        print("Naive Bayes: ", results['NBayes'])

        # Classification and Regression Trees / decision trees
        model = tree.DecisionTreeClassifier()
        result = cross_val_score(model, df, y, cv=kfold)
        results['CARTC'] = result.mean()
        print("CART: ", results['CARTC'])

        # Support Vector Machines
        model = svm.SVC()
        result = cross_val_score(model, df, y, cv=kfold)
        results['SVC'] = result.mean()
        print("Support Vector Machine: ", results['SVC'])

    elif task in ('r', 'R'):
        print("Spot Checking Regression Algorithms: ")
        scoring = 'neg_mean_squared_error'
        # Regression models

        # LINEAR regression models:
        # Linear Regression
        model = linear_model.LinearRegression()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['LiR'] = result.mean()
        print("Linear-\nLinear Regression: ", results['LiR'])

        # Ridge Regression (L2 norm)
        model = linear_model.Ridge()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['RR'] = result.mean()
        print("Ridge Regression: ", results['RR'])

        # Least Absolute Shrinkage and Selection Operator (L1 norm)
        model = linear_model.Lasso()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['LASSO'] = result.mean()
        print("LASSO: ", results['LASSO'])

        # ElasticNet Regression (L1 and L2 norm)
        model = linear_model.ElasticNet()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['ENet'] = result.mean()
        print("ElasticNet Regression: ", results['ENet'])

        # NON-LINEAR regression models:
        # K-Nearest Neighbours
        model = neighbors.KNeighborsRegressor()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['KNNR'] = result.mean()
        print("Non-Linear-\nKNN: ", results['KNNR'])

        # Classification and Regression Trees
        model = tree.DecisionTreeRegressor()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['CARTR'] = result.mean()
        print("CART: ", results['CARTR'])

        # Support Vector Machine
        model = svm.SVR()
        result = cross_val_score(model, df, y, cv=kfold, scoring=scoring)
        results['SVR'] = result.mean()
        print("Support Vector Machine: ", results['SVR'])

    else:
        print("Invalid task definition (r/c)!")
        return []   # added: avoid indexing an empty results dict below

    # Select the top three spot-checked algorithms
    # (assumes `import operator as op` at module level)
    res = []
    for i in range(3):
        res.append(max(results.items(), key=op.itemgetter(1))[0])
        del results[res[i]]
    return res
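# Hypothetical usage (df is a feature matrix, y the target; both placeholders):
# best_three = spot_check(task='c', seed=7, splits=10, df=df, y=y)
# 'best_three' holds the three result keys with the highest mean CV score.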
'''
for dataset in kk.getDatasets():
    cnt = cnt + 1
    print('Dataset Name:', dataset.DESCR[0:10])
    X = dataset.data
    Y = dataset.target
    X = kk.MeanNormalizer(X)
    n_comp = X.shape[1] - cnt
    if n_comp <= 0:
        n_comp = X.shape[1]
    pca.set_params(n_components=n_comp)
    X = pca.fit_transform(X)
    print('Score:', kk.fitModel(bayes, X, Y))
    del X, Y
'''

gauss = skn.GaussianNB()
bernoulli = skn.BernoulliNB()
multi = skn.MultinomialNB()

for dataset in kk.getDatasets(binary=True):
    # dataset = skd.load_breast_cancer()
    print('Dataset Name:', dataset.DESCR[0:20],
          '\n=================================')
    X = dataset.data + np.random.random(size=(dataset.data.shape))
    Y = dataset.target
    if len(np.unique(Y)) > 2:
        # note: binarize expects a float threshold; True is coerced to 1.0 here
        bernoulli.set_params(binarize=True)
    else:
        print('Binary Classification')
    X = kk.MeanNormalizer(X)
    pca.set_params(n_components=np.random.randint(1, X.shape[1] + 1))
    X = pca.fit_transform(X)
    kk.fitModel(bayes, X, Y)
# Multinomial Naive Bayes classification
testY = numpy.array(multinomialNB.predict(x))
trainY = numpy.array(y)
ErrorY = numpy.array(numpy.subtract(testY, trainY))
error = 0
for k in range(0, ErrorY.size, 1):
    if ErrorY[k] != 0:
        error = error + 1
sizeOfErrorY = ErrorY.size
error = float(float(error) / float(sizeOfErrorY))
error = error * float(100)
print("Multinomial NB percentage of error on training set is " + str(error))

# Gaussian Naive Bayes classification (variable renamed from the original
# misspelling "guassianNB"; Python 2 print statements converted to print())
gaussianNB = naive_bayes.GaussianNB()
gaussianNB.fit(x, y)
testY = numpy.array(gaussianNB.predict(x))
trainY = numpy.array(y)
ErrorY = numpy.array(numpy.subtract(testY, trainY))
error = 0
for k in range(0, ErrorY.size, 1):
    if ErrorY[k] != 0:
        error = error + 1
sizeOfErrorY = ErrorY.size
error = float(float(error) / float(sizeOfErrorY))
error = error * float(100)
print("Gaussian NB percentage of error on training set is " + str(error))
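# A vectorized equivalent of the loop-based error computation above (a sketch
# over the same arrays): compare predictions to labels directly and average.
error = 100.0 * numpy.mean(gaussianNB.predict(x) != numpy.asarray(y))
print("Gaussian NB percentage of error on training set is " + str(error))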
]]

clfs = {}
#clfs['lr'] = {'clf': linear_model.LogisticRegression(), 'name': 'LogisticRegression'}
#clfs['rf'] = {'clf': ensemble.RandomForestClassifier(n_estimators=750, n_jobs=-1), 'name': 'RandomForest'}
clfs['tr'] = {'clf': tree.DecisionTreeClassifier(), 'name': 'DecisionTree'}
#clfs['knn'] = {'clf': neighbors.KNeighborsClassifier(n_neighbors=4), 'name': 'kNearestNeighbors'}
#clfs['svc'] = {'clf': svm.SVC(kernel='linear'), 'name': 'SupportVectorClassifier'}
clfs['nusvc'] = {'clf': svm.NuSVC(), 'name': 'NuSVC'}
clfs['linearsvc'] = {'clf': svm.LinearSVC(), 'name': 'LinearSVC'}
clfs['SGD'] = {'clf': linear_model.SGDClassifier(), 'name': 'SGDClassifier'}
clfs['GPC'] = {
    'clf': gaussian_process.GaussianProcessClassifier(),
    'name': 'GaussianProcess'
}
clfs['nb'] = {'clf': naive_bayes.GaussianNB(), 'name': 'GaussianNaiveBayes'}
clfs['bag'] = {
    'clf': ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(),
                                      max_samples=0.5, max_features=0.5),
    'name': "BaggingClassifier"
}
clfs['gbc'] = {
    'clf': ensemble.GradientBoostingClassifier(),
    'name': 'GradientBoostingClassifier'
}
#clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100), alpha=1e-5, solver='lbfgs', max_iter=500), 'name': 'MultilayerPerceptron'}

parameters = {
    'solver': ['lbfgs'],
def naive_bayes_model(x_train, y_train, x_test, y_test):
    model = naive_bayes.GaussianNB()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    cal_scores(y_pred, y_test, 'GaussianNB')
def initialize_models(model_type, parameters, n_cores=-1):
    """
    Initializes and returns a model of the requested type.

    Inputs:
        model_type -- model type name
        parameters -- hyperparameter dictionary
        n_cores    -- number of cores to use (default -1, i.e. all cores)
    Returns:
        Initialized model
    """
    if model_type == "RandomForest":
        return ensemble.RandomForestClassifier(
            n_estimators=parameters['n_estimators'],
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)
    elif model_type == 'ScaledLogisticRegression':
        return ScaledLogisticRegression(
            C=parameters['C_reg'],
            penalty=parameters['penalty'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)
    elif model_type == "RandomForestBagging":
        # TODO make model bagging configurable
        return ensemble.BaggingClassifier(
            ensemble.RandomForestClassifier(
                n_estimators=parameters['n_estimators'],
                max_features=parameters['max_features'],
                criterion=parameters['criterion'],
                max_depth=parameters['max_depth'],
                min_samples_split=parameters['min_samples_split'],
                random_state=parameters['random_state'],
                n_jobs=n_cores),
            # bagging parameters
            n_estimators=parameters['n_estimators_bag'],
            max_samples=parameters['max_samples'],
            max_features=parameters['max_features_bag'],
            bootstrap=parameters['bootstrap'],
            bootstrap_features=parameters['bootstrap_features'],
            n_jobs=n_cores)
    elif model_type == "RandomForestBoosting":
        # TODO make model boosting configurable
        return ensemble.AdaBoostClassifier(
            ensemble.RandomForestClassifier(
                n_estimators=parameters['n_estimators'],
                max_features=parameters['max_features'],
                criterion=parameters['criterion'],
                max_depth=parameters['max_depth'],
                min_samples_split=parameters['min_samples_split'],
                random_state=parameters['random_state'],
                n_jobs=n_cores),
            # boosting parameters
            learning_rate=parameters['learning_rate'],
            algorithm=parameters['algorithm'],
            n_estimators=parameters['n_estimators_boost'])
    elif model_type == 'SVM':
        return svm.SVC(C=parameters['C_reg'],
                       kernel=parameters['kernel'],
                       probability=True,
                       random_state=parameters['random_state'])
    elif model_type == 'LogisticRegression':
        return linear_model.LogisticRegression(
            C=parameters['C_reg'],
            penalty=parameters['penalty'],
            random_state=parameters['random_state'])
    elif model_type == 'AdaBoost':
        return ensemble.AdaBoostClassifier(
            learning_rate=parameters['learning_rate'],
            algorithm=parameters['algorithm'],
            n_estimators=parameters['n_estimators'],
            random_state=parameters['random_state'])
    elif model_type == 'ExtraTrees':
        return ensemble.ExtraTreesClassifier(
            n_estimators=parameters['n_estimators'],
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)
    elif model_type == 'GradientBoostingClassifier':
        return ensemble.GradientBoostingClassifier(
            n_estimators=parameters['n_estimators'],
            learning_rate=parameters['learning_rate'],
            subsample=parameters['subsample'],
            max_depth=parameters['max_depth'],
            random_state=parameters['random_state'])
    elif model_type == 'GaussianNB':
        # GaussianNB takes no hyperparameters here
        return naive_bayes.GaussianNB()
    elif model_type == 'DecisionTreeClassifier':
        return tree.DecisionTreeClassifier(
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            random_state=parameters['random_state'])
    elif model_type == 'SGDClassifier':
        return linear_model.SGDClassifier(
            loss=parameters['loss'],
            penalty=parameters['penalty'],
            random_state=parameters['random_state'],
            n_jobs=n_cores)
    elif model_type == 'KNeighborsClassifier':
        return neighbors.KNeighborsClassifier(
            n_neighbors=parameters['n_neighbors'],
            weights=parameters['weights'],
            algorithm=parameters['algorithm'],
            n_jobs=n_cores)
    else:
        raise ConfigError("Sorry, unsupported model {}".format(model_type))
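# Hypothetical call (parameter keys taken from the 'RandomForest' branch above;
# the values are illustrative placeholders):
# params = {'n_estimators': 500, 'max_features': 'sqrt', 'criterion': 'gini',
#           'max_depth': 10, 'min_samples_split': 2, 'random_state': 42}
# rf = initialize_models('RandomForest', params, n_cores=-1)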
def gaussian_naive_bayes(x_train, y_train, x_test, y_test):
    model = naive_bayes.GaussianNB()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    cal_metrics(y_test, y_pred, 'gaussianNB')
def train_NB(x_train, y_train):
    model = naive_bayes.GaussianNB()
    model.fit(X=x_train, y=y_train)
    return model
    def __init__(self):
        from sklearn import naive_bayes
        self.algorithm = naive_bayes.GaussianNB()
        col[h] = ['orange', 'orange', 'green', 'green', 'green']
        labels[h] = [6, 6, 2, 2, 2]
    elif i == uncertain or i == utest:
        col[h] = ['red', 'red', 'green', 'green', 'green']
        labels[h] = [7, 7, 2, 2, 2]
    h = h + 1

#sample = testdata[0:500]
#pdsample = plotdata[0:500]
print(np.shape(sampletest))
#fig = pl.figure(1)
#pl.clf()

print("Gaussian Naive Bayes")
gnb = naive_bayes.GaussianNB()
for i in range(0, 5):
    if i == 1 or i == 3:
        X_train = training2
        y_train = col[100:172, i]
        X_test = sample2
        y_test = col[272:, i]
    else:
        X_train = training
        y_train = col[:172, i]
        X_test = sampletest
        y_test = col[172:, i]
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
print(iris.target.shape)

# in order to plot the data, we keep only the first two features
iris.data = iris.data[:, :2]

# for binary classification, treat label 2 as the positive class and all
# others as the negative class
iris.target[iris.target != 2] = -1
iris.target[iris.target == 2] = 1
iris.target[iris.target == -1] = 0

# Split the data into training/testing sets
d_train, d_test, t_train, t_test = model_selection.train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)

# make the model
prior = [0.9, 0.1]  # P(Y=0) = 0.9, P(Y=1) = 0.1
model = naive_bayes.GaussianNB(priors=prior)  # if priors=None, the model learns the prior from the data

# train the model
model.fit(d_train, t_train)

# build the grid used for plotting
h = 0.01
x_min, x_max = d_train[:, 0].min() - .5, d_train[:, 0].max() + .5
y_min, y_max = d_train[:, 1].min() - .5, d_train[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

# plotting
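# A minimal plotting sketch (assumed; the original stops at the comment above).
# Assumes `import matplotlib.pyplot as plt` earlier in the script.
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)   # decision regions
plt.scatter(d_train[:, 0], d_train[:, 1], c=t_train, edgecolors='k')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.show()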
# Constants
long_names = {
    "lr": "Logistic regression",
    "lsvc": "SVC with linear kernel",
    "rbfsvc": "SVC with RBF kernel",
    "nb": "Naive Bayes",
    "dt": "Decision tree",
    "rf": "Random forest",
    "et": "Extremely randomized trees"
}

classifier_dict = {
    "lr": linear_model.LogisticRegression(),
    "lsvc": svm.SVC(kernel="linear", probability=True),
    "rbfsvc": svm.SVC(probability=True),
    "nb": naive_bayes.GaussianNB(),
    "dt": tree.DecisionTreeClassifier(),
    "rf": ensemble.RandomForestClassifier(n_estimators=100),
    "et": ensemble.ExtraTreesClassifier(n_estimators=100)
}

normalization_dict = {
    "lr": "scaled",
    "lsvc": "scaled",
    "rbfsvc": "scaled",
    "nb": "log"
}

feature_selection_C_methods = [
    lambda C: svm.LinearSVC(C=C, penalty="l1", dual=False),
    lambda C: linear_model.LogisticRegression(C=C, penalty="l1"),
                                                  random_state=4)

# decision tree with gini impurity criterion
dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
dtree_gini_mod.fit(x_train, y_train)
preds_gini = dtree_gini_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds_gini)

# decision tree with entropy criterion
dtree_entropy_mod = tree.DecisionTreeClassifier(criterion='entropy')
dtree_entropy_mod.fit(x_train, y_train)
preds_entropy = dtree_entropy_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds_entropy)

# Naive Bayes classifier
gnb_mod = naive_bayes.GaussianNB()
gnb_mod.fit(x_train, y_train)
preds = gnb_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds)

# Best model
data = pd.read_csv(r'C:\GitHub\DSCI401\data\churn_data.csv')  # raw string avoids backslash escapes

del data['FamilySize']
del data['CustID']

# label-encode the categorical variables
for i in data.columns:
    if data[i].dtype == 'object':
        le = preprocessing.LabelEncoder()
        le.fit(list(data[i].values))