def get_model(choice='lr', class_weight=None):
    if choice == 'svc':
        model = svc(verbose=1, class_weight=class_weight, n_jobs=-1)
    elif choice == 'lsvc':
        model = lsvc(class_weight=class_weight, n_jobs=-1)
    elif choice == 'knn':
        model = KNeighborsClassifier()
    elif choice == 'msvm':
        model = MulticlassSVM(C=0.1, tol=0.01, max_iter=100, random_state=0, verbose=1)
    elif choice == 'gnb':
        model = gnb(class_weight=class_weight)
    elif choice == 'gpc':
        model = gpc(class_weight=class_weight)
    elif choice == 'sgdc':
        model = sgdc(class_weight=class_weight)
    elif choice == 'rf':
        model = rf(class_weight=class_weight)
    # elif choice == 'vw':
    #     model = vw()
    else:
        model = lr(class_weight=class_weight)
    return model
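# A minimal usage sketch for get_model, assuming the aliases above are the
# usual scikit-learn imports (e.g. lr = LogisticRegression,
# rf = RandomForestClassifier). Note that if gnb is sklearn's GaussianNB, it
# accepts no class_weight argument, so that branch would need adjusting.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
clf = get_model('rf', class_weight='balanced')  # unknown choices fall back to lr
clf.fit(X_demo, y_demo)
print(clf.score(X_demo, y_demo))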
def _gnb(t, min_freq, save=False):
    if save:
        # records/labels are module-level training data
        clf = gnb().fit(records, labels)
        save_classifier(clf, t, 'gnb', min_freq)
        return ('gnb', clf)
    else:
        clf = load_classifier(t, 'gnb', min_freq)
        return ('gnb', clf)
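# Hedged sketch of the save_classifier/load_classifier helpers used by _gnb;
# their real implementations are not shown, so the joblib-based persistence
# and the file-naming scheme below are illustrative assumptions only:
import joblib

def save_classifier(clf, t, name, min_freq):
    # hypothetical naming scheme: <name>_<t>_<min_freq>.joblib
    joblib.dump(clf, '%s_%s_%s.joblib' % (name, t, min_freq))

def load_classifier(t, name, min_freq):
    return joblib.load('%s_%s_%s.joblib' % (name, t, min_freq))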
def test_step(index, train_df, train_labels, test_df, test_labels):
    # note: the original signature took an unused train_step argument while
    # referencing an undefined test_labels; renamed here so the score call works
    clf = gnb()
    train_data = train_df[index].values
    test_data = test_df[index].values
    clf.fit(train_data, train_labels)
    score = clf.score(test_data, test_labels)
    print('List length: %d' % len(index))
    print('The index list is: ', index)
    print('The score is: %.3f ' % score)
    return score, clf
def train_model(ft, lbl):
    import numpy as np
    from sklearn.naive_bayes import GaussianNB as gnb
    model = gnb()
    # model = rfc(random_state=0)
    # model = knn(n_neighbors=10, weights='distance', p=2)
    # zero out columns 0-1 and 3 so they carry no signal
    ft[:, 0:2] = np.zeros((ft.shape[0], 2))
    ft[:, 3:4] = np.zeros((ft.shape[0], 1))
    print(ft)
    model.fit(ft, lbl)
    return model
def main(): "main program" app = get_app_title() appf = get_app_file() loans_df, loans_y, test_df, test_y, numeric_vars = load_data() indep_vars = numeric_vars print("numeric_vars\n", numeric_vars) plotdir = make_plotdir() loans_X = loans_df test_X = test_df clf = gnb() # skip scaling for now, score 87% do_fit(clf, loans_X, loans_y, print_out=True) pred_y = do_predict(clf, test_X, test_y, print_out=True) plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y) loans_X, my_scaler = scale_train_data(loans_df, print_out=True) test_X = scale_test_data(my_scaler, test_df) clf = gnb() # add scaling, score 87% do_fit(clf, loans_X, loans_y, print_out=True) pred_y = do_predict(clf, test_X, test_y, print_out=True) plot_predict(plotdir, app, appf, "allscale", indep_vars, test_df, test_y, pred_y) # gnb has no meta-parameters to explore, optimize loans_X = loans_df test_X = test_df clf = gnb() # score 84% +- 4% cross_validate(clf, loans_X, loans_y, print_out=True) clf = gnb() # best score 89% +- 4% opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir, rescale=False) # redo with optimized columns loans_X = loans_df[opt_list] test_X = test_df[opt_list] clf = gnb() # best score 89% +- 4% cross_validate(clf, loans_X, loans_y, print_out=True) clf = gnb() # fit score 89%, predict score 91% do_fit(clf, loans_X, loans_y, print_out=True) pred_y = do_predict(clf, test_X, test_y, print_out=True) plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
def hoursperweek_test(x):
    if x < 50:
        return 0
    else:
        return 1

def capitalgain_test(x):
    if x == 0:
        return 0
    else:
        return 1

data_test['age'], _ = pd.factorize(data_test['age'].apply(age_test))
data_test['hoursperweek'], _ = pd.factorize(data_test['hoursperweek'].apply(hoursperweek_test))
data_test['capitalgain'], _ = pd.factorize(data_test['capitalgain'].apply(capitalgain_test))
data_test = data_test.drop('educationno', axis=1)
data_test = data_test.drop('capitalgain', axis=1)

x_train = data_train.drop("Salary", axis=1)
y_train = data_train["Salary"]
x_test = data_test.drop("Salary", axis=1)
y_test = data_test["Salary"]

model = gnb().fit(x_train, y_train)
prediction = model.predict(x_test)

# accuracy
np.mean(y_test == prediction)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
# Create your classifier here
from sklearn.naive_bayes import GaussianNB as gnb
classifier = gnb()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
# The original snippet breaks off mid-call here; the truncated line is kept
# commented out, and a hedged completion follows below.
# X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1,
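# Hedged completion of the truncated visualisation block above, following the
# well-known decision-boundary template this snippet comes from; the step
# size (0.01) and the two-feature assumption are guesses, not the original:
import numpy as np
import matplotlib.pyplot as plt

X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=('red', 'green')[i], label=j)
plt.title('Gaussian Naive Bayes (Training set)')
plt.legend()
plt.show()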
train, test = train_test_split(data_mod, test_size=0.2)
# X = diabetes[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
# y = diabetes[['Outcome']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

print(data_mod.shape)
print(train.shape)
print(test.shape)

features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'BMI', 'Age', 'Insulin', 'DiabetesPedigreeFunction'
]
target = 'Outcome'

classifiers = [knnc(), dtc(), SVC(gamma='auto'), SVC(kernel='linear'), gnb()]
classifier_names = [
    'K nearest neighbors',
    'Decision Tree Classifier',
    'SVM classifier with RBF kernel',
    'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]

for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, train[features], train[target], cv=5)
    print(clf_name, ' mean accuracy: ', round(cv_scores.mean() * 100, 3),
          '% std: ', round(cv_scores.std() * 100, 3), '%')  # std, not var, to match the label

final_model_smv_lin = SVC(kernel='linear', probability=True).fit(train[features], train[target])
# final_model_gnb = gnb().fit(train[features], train[target])
# (this snippet opens inside the body of the one-vs-all evaluation function;
# the def line below is reconstructed from the call site at the bottom)
def one_vs_all_test(x, y, f):
    predictions = []
    for classifier in f:
        scores = classifier.predict_proba(x)[:, 0]
        print(scores)
        predictions.append(scores)
    predictions = np.array(predictions).T
    predictions = [guess.tolist().index(max(guess)) for guess in predictions]
    pos = [int(predict == trueval) for predict, trueval in zip(predictions, y)]
    acc = sum(pos) / len(pos)
    print(acc)
    return

data = np.loadtxt('smarthome.csv', delimiter=',')
x, y = split_data(data)
reg_stats = show_stats(x, y, gnb(), 10, 'Basic Gaussian Naive Bayes')
disc = discretize_interval(data)
x, y = split_data(disc)
disc_stats = show_stats(x, y, gnb(), 10, "Discretized Naive Bayes")
tval, pval = ttest_ind(reg_stats['test_score'], disc_stats['test_score'])
print("tval", tval, "pval", pval)

x_train = x[0:int(len(x) * 2 / 3)]
x_test = x[int(len(x) * 2 / 3):-1]
y_train = y[0:int(len(y) * 2 / 3)]
y_test = y[int(len(x) * 2 / 3):-1]
e = one_vs_all_train(x_train, y_train, gnb(), range(8))
one_vs_all_test(x_test, y_test, e)
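# Hedged sketch of the one_vs_all_train helper called above; its definition
# is not shown, so this reconstruction is a guess. To stay consistent with
# predict_proba(x)[:, 0] in the test function, each binary problem labels
# membership in class c as 0, so column 0 is P(sample belongs to c).
from sklearn.base import clone

def one_vs_all_train(x, y, base_clf, classes):
    fitted = []
    for c in classes:
        binary_y = [0 if label == c else 1 for label in y]
        clf = clone(base_clf)
        clf.fit(x, binary_y)
        fitted.append(clf)
    return fitted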
import json
import numpy as np
from sklearn.model_selection import KFold as kf, cross_val_score as cvs
from sklearn.naive_bayes import GaussianNB as gnb, BernoulliNB as bnb

with open('train.json') as data:
    train = json.load(data)

cuisine = []
ingredients = []
for i in train:
    cuisine.append(i["cuisine"])
    ingredients.extend(i["ingredients"])
singredients = list(set(ingredients))

# one-hot encode each recipe's ingredients
traind = []
d = {singredients[i]: i for i in range(len(singredients))}
for i in train:
    row = [0] * len(singredients)
    for j in i["ingredients"]:
        row[d[j]] = 1
    traind.append(row)

k_fold = kf(n_splits=3)
ga = cvs(gnb(), traind, cuisine, cv=k_fold, n_jobs=-1)
ba = cvs(bnb(), traind, cuisine, cv=k_fold, n_jobs=-1)

f = open('2d', 'w')  # text mode, since strings are written below
s = "Gaussian accuracy is: " + str(np.mean(ga))
print(s)
f.write(s)
s = "Bernoulli accuracy is: " + str(np.mean(ba))
print(s)
f.write(s)
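# The manual one-hot loop above can equivalently be written with
# scikit-learn's MultiLabelBinarizer (same matrix up to column order):
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
traind_alt = mlb.fit_transform([r["ingredients"] for r in train])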
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB as gnb
# from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# from sklearn.svm import SVC as svc

clf1 = gnb()
# clf2 = lr()
clf3 = rfc()
clf4 = abc()
clf5 = gbc()
# clf6 = svc()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
# #####################################################
import pandas as pd

if __name__ == '__main__':
    train_features = pd.read_csv('train/train_features.csv')
    columns = train_features.columns[3:]
    train_labels = train_features["drop"].values
    train_features = train_features[columns].values

    from sklearn.naive_bayes import GaussianNB as gnb
    clf = gnb()

    from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer versions
    from sklearn.cross_validation import train_test_split
    score = cross_val_score(clf, train_features, train_labels, cv=5, scoring='accuracy')
    print(score)

    # X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.4, random_state=1)
    # clf.fit(train_features, train_labels)
    # clf.fit(X_train, y_train)
    # print(clf.score(X_test, y_test))
    # print("fit success")

    del train_features
    del train_labels
import numpy as np
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.metrics import accuracy_score
from time import time

X = np.array([[2, 5], [3, 6], [1, 7], [1, 2], [4, 3],
              [6, 8], [7, 3], [6, 1], [8, 7], [9, 3]])
Y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

startTime = time()
clf = gnb()
clf.fit(X, Y)
pred = clf.predict([[0, 1]])
print(pred)

testX = np.array([[1, 9], [3, 1], [4, 7], [6, 5], [5, 5], [7, 9]])
testY = np.array([1, 1, 1, 2, 2, 2])
pred = clf.predict(testX)
print("Accuracy ", accuracy_score(testY, pred) * 100)
print("Time ", round(time() - startTime, 3), "sec")
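# For reference, GaussianNB can be reproduced by hand: it fits a per-class,
# per-feature mean and variance and scores a query by Gaussian log-likelihood
# plus the log class prior. A minimal sketch (sklearn additionally adds a tiny
# var_smoothing term, so results agree only up to that epsilon):
def manual_gnb_predict(X, Y, query):
    classes = np.unique(Y)
    log_posteriors = []
    for c in classes:
        Xc = X[Y == c]
        mu, var = Xc.mean(axis=0), Xc.var(axis=0)
        log_prior = np.log(len(Xc) / len(X))
        log_like = -0.5 * np.sum(np.log(2 * np.pi * var) + (query - mu) ** 2 / var)
        log_posteriors.append(log_prior + log_like)
    return classes[np.argmax(log_posteriors)]

print(manual_gnb_predict(X, Y, np.array([0, 1])))  # should agree with clf.predict above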
    # (fragment: the tail of a helper that averages cross-validation scores)
    scores.append(sum(stats['test_score']) / len(stats['test_score']))
    for i, score in enumerate(scores, start=1):  # i was undefined here originally
        print('monk', i, ':', score)
    print('-----------------------------')
    return sum(scores) / len(scores)


x1, y1 = read_data("monks-1.csv")
x2, y2 = read_data("monks-2.csv")
x3, y3 = read_data("monks-3.csv")
feats = [x1, x2, x3]
labs = [y1, y2, y3]

print('***Using 3-fold validation***')
worst = show_stats(feats, labs, pct(max_iter=100, tol=0), 3, 'perceptron')
best = show_stats(feats, labs, dt(max_depth=10), 3, 'decision tree')
show_stats(feats, labs, knn(n_neighbors=3), 3, 'K-nearest-neighbors')
show_stats(feats, labs, gnb(), 3, 'Gaussian Naive Bayes')
print('t test between perceptron and decision tree:', ttest_ind(worst, best))

print('***Using Leave-one-out***')
worst = show_stats(feats, labs, pct(max_iter=50, tol=0), loo(), 'perceptron')
best = show_stats(feats, labs, dt(max_depth=10), loo(), 'decision tree')
show_stats(feats, labs, knn(n_neighbors=3), loo(), 'K-nearest-neighbors')
show_stats(feats, labs, gnb(), loo(), 'Gaussian Naive Bayes')
print('t test between perceptron and decision tree:', ttest_ind(worst, best))
def predictByGNB(features, classes, test):
    # no hyperparameters needed for GaussianNB
    clf = gnb()
    clf.fit(features, classes)
    return clf.predict(test)
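# Minimal usage sketch for predictByGNB with toy data (assumes gnb has been
# imported as sklearn's GaussianNB in this module):
import numpy as np

toy_features = np.array([[0.0], [0.1], [1.0], [1.1]])
toy_classes = np.array([0, 0, 1, 1])
print(predictByGNB(toy_features, toy_classes, np.array([[0.9]])))  # -> [1]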
print("LABEL:") print(labels.head()) feature_train,feature_test,label_train,label_test=train_test_split(featuers,labels,test_size=0.3,train_size=0.7) # 1.Using KNN Classification model = KNeighborsClassifier(5) # The knearest k=5 model.fit(feature_train, label_train) predicts=model.predict(feature_test) print("PREDICT RESULT Using KNN:",predicts) accuracy = accuracy_score(label_test, predicts) print('Accuracy of KNN classifier :',accuracy) # =82% # 2.Using Naive Bayes Classification model = gnb() model.fit(feature_train, label_train) predicts=model.predict(feature_test) print("PREDICT RESULT Using Naive Bayes:",predicts) accuracy = accuracy_score(label_test, predicts) print('Accuracy of Naive Bayes classifier :',accuracy) # =82% # 3.Using Decision Tree Induction Classification # First we need to convert continuous values into categorial as much as we can print(max(featuers.age),min(featuers.age)) # to know pins range #70 #32 ageCategory=pd.cut(featuers.age,bins=[0,17,32,65,100],labels=['child','teenager','adult','elderly']) cigsPerDayCategory=pd.cut(featuers.cigsPerDay,bins=[-1,2.0,5.0,7.0,20.0],labels=['low','medium','high','veryHigh']) featuers.insert(2,"ageCategory",ageCategory) featuers.insert(4,"cigsPerDayCategory",cigsPerDayCategory) # Now after adding a categorial columns we need to drop continuous values columns
def naive_bayes_speed_test(dftrain, dftrain_y, plotdir):
    atitle = 'Naive Bayes'
    afile = 'nbayes'
    clf = gnb()
    # speed_test_medium(clf, dftrain, dftrain_y, atitle, afile, plotdir)
    speed_test_large(clf, dftrain, dftrain_y, atitle, afile, plotdir)
]
target = ['defects']

# print(df)
# print(df.shape)
# print(train[features].shape)
# print(train[target].shape)
# print(train[target].values.ravel().shape)
# print(test.shape)

# Y = train[target].values.reshape(train[target].shape[0])
# print(Y)

classifiers = [knnc(), dtc(), SVC(), SVC(kernel='linear'), gnb()]
classifier_names = [
    'K nearest neighbors',
    'Decision Tree Classifier',
    'SVM classifier with RBF kernel',
    'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]

# for clf, clf_name in zip(classifiers, classifier_names):
#     cv_scores = cross_val_score(clf, train[features], train[target], cv=5)
#     print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3),
#           '% std: ', round(cv_scores.std()*100, 3), '%')

# final_model_smv_lin = SVC(kernel='linear').fit(train[features], Y)
final_model_gnb = gnb().fit(train[features], train[target].values.ravel())  # ravel to 1-D labels
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.metrics import accuracy_score

data = pd.read_csv('pima-indians-diabetes.csv')
print(data.describe())

features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'BMI', 'Age', 'Insulin', 'DiabetesPedigreeFunction'
]
target = 'Class'

train, test = train_test_split(data, test_size=0.2)
clf = gnb().fit(train[features], train[target])
y_predicted = clf.predict(test[features])
print("Accuracy ", round(accuracy_score(test[target], y_predicted) * 100, 2), " %")

# ///////////////////////OUTPUT//////////////////////////////////////////////////////
# chanchald@chanchald-X553SA:~$ cd Desktop
# chanchald@chanchald-X553SA:~/Desktop$ cd p2
# chanchald@chanchald-X553SA:~/Desktop/p2$ python script.py
#        Pregnancies     Glucose  BloodPressure  SkinThickness         BMI
# count   768.000000  768.000000     768.000000     768.000000  768.000000
# mean      3.845052  120.894531      69.105469      20.536458   79.799479
# std       3.369578   31.972618      19.355807      15.952218  115.244002
# min       0.000000    0.000000       0.000000       0.000000    0.000000
# 25%       1.000000   99.000000      62.000000       0.000000    0.000000
# 50%       3.000000  117.000000      72.000000      23.000000   30.500000
# 75%       6.000000  140.250000      80.000000      32.000000  127.250000
# max      17.000000  199.000000     122.000000      99.000000  846.000000
get_dummies('Parch')

# Replacing single missing fare with mean
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)

# Standard Scaling
# Scales the data so that it has mean 0 and variance 1
# Removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train_data['Fare'] = sc.fit_transform(train_data[['Fare']])
test_data['Fare'] = sc.transform(test_data[['Fare']])  # transform only: reuse the train fit

from sklearn.model_selection import train_test_split as tts
x = train_data.drop(['PassengerId', 'Survived'], axis=1)
y = train_data['Survived']
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.metrics import accuracy_score as acc
gaussian = gnb()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_test)
acc_gn = round(acc(y_pred, y_test) * 100, 2)
print("Accuracy Using Naive Bayes ", acc_gn)
dataset2 = pd.concat([dataset2, Embarked_dum], axis=1)
dataset2 = dataset2.drop("Sex", axis=1)
dataset2 = dataset2.drop("Embarked", axis=1)

x = dataset2.iloc[:, [2, 4, 5, 6, 8, 10, 11, 12, 13]].values
y = dataset2.iloc[:, [1]].values
x = x.astype(float)

import statsmodels.api as st
x = np.append(arr=np.ones((889, 1)).astype(int), values=x, axis=1)
x_o = x[:, [0, 1, 2, 3, 6, 7, 8, 9]]
reg_OLS = st.OLS(endog=y, exog=x_o).fit()
reg_OLS.summary()

from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x_o, y, test_size=0.2, random_state=23)

from sklearn import preprocessing
mms = preprocessing.MinMaxScaler(feature_range=(0, 1))
x_train_mms = mms.fit_transform(x_train)
x_test_mms = mms.transform(x_test)  # transform only, to reuse the training fit

from sklearn.naive_bayes import GaussianNB as gnb
cf = gnb()
cf.fit(x_train_mms, y_train.ravel())  # ravel the (n, 1) labels to 1-D
y_pre = cf.predict(x_test_mms)

from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pre))
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest

selection = SelectKBest(k=10)
X_new = selection.fit(X_poly[:72325, :], Y).transform(X_poly)
X = X_new[:72325, :]
X_sub = X_new[72325:, :]

### Classifiers
KNC3 = KNeighborsClassifier(n_neighbors=5)
SVM = svm.SVC(probability=True)
GNB = gnb()
DT = DecisionTreeClassifier(criterion='gini', random_state=1)
GBC = GradientBoostingClassifier(n_estimators=8000, loss='deviance')
MPL = MLPClassifier(alpha=1e-5, activation='relu', random_state=1,
                    hidden_layer_sizes=(100, 100, 100, 100))
RFC = RandomForestClassifier(n_estimators=300, criterion='gini',
                             random_state=1, oob_score=True)
SGD = SGDClassifier()

agg = VotingClassifier(estimators=[('SVM', SVM), ('GNB', GNB), ('RFC', RFC)],
                       voting='soft', weights=[1, 1, 1])
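# Soft voting, as configured for agg above, averages the estimators'
# predict_proba outputs (equal weights here) and predicts the argmax.
# Self-contained demo of that mechanic on toy data with fresh estimators
# (not the objects from this script):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB

Xd, yd = make_classification(n_samples=200, random_state=1)
e1 = GaussianNB().fit(Xd, yd)
e2 = RandomForestClassifier(n_estimators=50, random_state=1).fit(Xd, yd)
vote = VotingClassifier([('nb', GaussianNB()),
                         ('rf', RandomForestClassifier(n_estimators=50, random_state=1))],
                        voting='soft').fit(Xd, yd)
avg = (e1.predict_proba(Xd) + e2.predict_proba(Xd)) / 2.0
print((avg.argmax(axis=1) == vote.predict(Xd)).all())  # True: same averaging rule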
    # (tail of the simple_gridsearch helper; a hedged reconstruction follows below)
    return grid.best_estimator_


# try logistic regression
lr = lrc(random_state=seed)
params = {"C": [10000, 15000]}
lr1 = simple_gridsearch(lr, params)

# try random forest
rf = RandomForestClassifier(random_state=seed)
params = {'n_estimators': [250, 500]}
rf1 = simple_gridsearch(rf, params)

# try gaussian bayes
t0 = time.time()
nb = gnb()
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train, y)
nb.fit(Xtrain, ytrain)
prediction = nb.predict(Xtest)
print(roc_auc_score(ytest, prediction))
print('timeused: ', time.time() - t0)

# try KNN
knn = KNeighborsClassifier()
params = {"n_neighbors": [5, 10, 15, 20]}
knn1 = simple_gridsearch(knn, params)

# try gradient boosting
gb = GradientBoostingClassifier(random_state=seed)
params = {"n_estimators": [100, 250, 500]}
gb1 = simple_gridsearch(gb, params)
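# Hedged reconstruction of the simple_gridsearch helper whose tail
# (return grid.best_estimator_) opens this snippet; the CV folds, the
# roc_auc scoring, and the use of the global X_train/y are assumptions:
from sklearn.model_selection import GridSearchCV

def simple_gridsearch(model, params):
    grid = GridSearchCV(model, params, cv=3, scoring='roc_auc')
    grid.fit(X_train, y)
    print(grid.best_params_, grid.best_score_)
    return grid.best_estimator_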
import numpy as np
import pandas as pd
import regression as reg
from sklearn.naive_bayes import GaussianNB as gnb

N = 20000
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
train_data = train_data.drop(['url'], axis=1)        # remove 'url' information
train_data = train_data.drop(['timedelta'], axis=1)  # remove 'timedelta' information

X = np.array(train_data.drop(['shares'], axis=1))
y = np.array(train_data['shares'])  # this is the target
XTrain = X[:N, :]  # use the first N samples for training
yTrain = y[:N]
XVal = X[N:, :]    # use the rest for validation
yVal = y[N:]
# print(type(XTrain))  # matrix

model = gnb()
model.fit(XTrain, yTrain)
training = model.predict(XTrain)
validation = model.predict(XVal)

print("NB")
print("Training error ", np.mean(np.abs(yTrain - training)))
print("Validation error ", np.mean(np.abs(yVal - validation)))

Xtest = test_data.values
result = model.predict(Xtest)
np.savetxt('result/resultNB.txt', result)
from datahandle import handle_data
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.linear_model import LogisticRegression as lr
from sklearn.naive_bayes import GaussianNB as gnb

dat = handle_data('cleveland')
dat.read_data()
X_train, X_test, y_train, y_test = dat.partition()
models = [rfc(), dtc(), lr(), gnb()]

def pred_prob(model):
    model.fit(X_train, y_train)
    true = y_test
    pred = model.predict(X_test)
    prob = model.predict_proba(X_test)
    return true, pred, prob
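# Usage sketch: run pred_prob over each model and report accuracy and ROC AUC
# (a binary target is assumed here; prob[:, 1] is the positive-class score):
from sklearn.metrics import accuracy_score, roc_auc_score

for m in models:
    true, pred, prob = pred_prob(m)
    print(type(m).__name__,
          'accuracy:', round(accuracy_score(true, pred), 3),
          'AUC:', round(roc_auc_score(true, prob[:, 1]), 3))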