Example #1
def get_model(choice='lr', class_weight=None):
    if choice == 'svc':
        model = svc(verbose=1, class_weight=class_weight)  # assuming svc is sklearn.svm.SVC, it takes no n_jobs argument

    elif choice == 'lsvc':
        model = lsvc(class_weight=class_weight)  # LinearSVC takes no n_jobs argument either
    elif choice == 'knn':
        model = KNeighborsClassifier()
    elif choice == 'msvm':
        model = MulticlassSVM(C=0.1,
                              tol=0.01,
                              max_iter=100,
                              random_state=0,
                              verbose=1)

    elif choice == 'gnb':
        model = gnb()  # GaussianNB has no class_weight parameter

    elif choice == 'gpc':
        model = gpc()  # GaussianProcessClassifier has no class_weight parameter
    elif choice == 'sgdc':
        model = sgdc(class_weight=class_weight)

    elif choice == 'rf':
        model = rf(class_weight=class_weight)
#   elif choice == 'vw':
#         model = vw()
    else:
        model = lr(class_weight=class_weight)
    return model
Example #2
def _gnb(t, min_freq, save=False):
    # `records` and `labels` are assumed to be module-level training data
    if save:
        clf = gnb().fit(records, labels)
        save_classifier(clf, t, 'gnb', min_freq)
        return ('gnb', clf)
    else:
        clf = load_classifier(t, 'gnb', min_freq)
        return ('gnb', clf)
Example #3
def test_step(index, train_df, train_labels, test_df, test_labels):
  # `test_labels` is assumed to be passed in (it was previously referenced but never defined)
  clf = gnb()
  train_data = train_df[index].values
  test_data = test_df[index].values
  clf.fit(train_data, train_labels)
  score = clf.score(test_data, test_labels)
  print('List length: %d' % len(index))
  print('The index list is: ', index)
  print('The score is: %.3f ' % score)
  return score, clf
Example #4
def train_model(ft, lbl):
    import numpy as np
    from sklearn.naive_bayes import GaussianNB as gnb
    model = gnb()
    #model = rfc(random_state=0)
    #model = knn(n_neighbors=10, weights='distance', p=2)
    # zero out selected feature columns before fitting
    ft[:, 0:2] = np.zeros((ft.shape[0], 2))
    ft[:, 3:4] = np.zeros((ft.shape[0], 1))

    print(ft)
    model.fit(ft, lbl)
    return model
Example #5
def main():
    "main program"
    
    app = get_app_title()
    appf = get_app_file()
    
    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars
    print("numeric_vars\n", numeric_vars)
    
    plotdir = make_plotdir()
    
    loans_X = loans_df
    test_X = test_df
    clf = gnb()         # skip scaling for now, score 87%
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)
    
    loans_X, my_scaler = scale_train_data(loans_df, print_out=True)
    test_X = scale_test_data(my_scaler, test_df)
    clf = gnb()     # add scaling, score 87%
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "allscale", indep_vars, test_df, test_y, pred_y)
    
    # GaussianNB has essentially no hyperparameters to tune (only priors and var_smoothing)
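    # (a var_smoothing grid search would still be possible, e.g.
    #  GridSearchCV(gnb(), {'var_smoothing': np.logspace(-11, -7, 5)}), but is skipped here)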
    
    loans_X = loans_df
    test_X = test_df
    clf = gnb()   # score 84% +- 4%
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = gnb()    # best score 89% +- 4%
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y, app, appf, plotdir, rescale=False)
    
    # redo with optimized columns
    loans_X = loans_df[opt_list]
    test_X = test_df[opt_list]
    clf = gnb()         # best score 89% +- 4%
    cross_validate(clf, loans_X, loans_y, print_out=True)
    
    clf = gnb()         # fit score 89%, predict score 91%
    do_fit(clf, loans_X, loans_y, print_out=True)
    pred_y = do_predict(clf, test_X, test_y, print_out=True)  
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
def hoursperweek_test(x):
    if x < 50:
        return 0
    else:
        return 1

def capitalgain_test(x):
    if x == 0:
        return 0
    else:
        return 1

data_test['age'], _ = pd.factorize(data_test['age'].apply(age_test))
data_test['hoursperweek'], _ = pd.factorize(data_test['hoursperweek'].apply(hoursperweek_test))
data_test['capitalgain'], _ = pd.factorize(data_test['capitalgain'].apply(capitalgain_test))
data_test = data_test.drop('educationno', axis=1)
data_test = data_test.drop('capitalgain', axis=1)

x_train = data_train.drop("Salary", axis=1)
y_train = data_train["Salary"]

x_test = data_test.drop("Salary", axis=1)
y_test = data_test["Salary"]

model = gnb().fit(x_train, y_train)

prediction = model.predict(x_test)

# accuracy
print(np.mean(y_test == prediction))
Example #7
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
# Create your classifier here

from sklearn.naive_bayes import GaussianNB as gnb
classifier = gnb()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
Example #8
train, test = train_test_split(data_mod, test_size=0.2)

# X = diabetes[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
# y = diabetes[['Outcome']]
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2)

print(data_mod.shape)
print(train.shape)
print(test.shape)

features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Age',
    'Insulin', 'DiabetesPedigreeFunction'
]
target = 'Outcome'
classifiers = [knnc(), dtc(), SVC(gamma='auto'), SVC(kernel='linear'), gnb()]
classifier_names = [
    'K nearest neighbors', 'Decision Tree Classifier',
    'SVM classifier with RBF kernel', 'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]

for clf, clf_name in zip(classifiers, classifier_names):
    cv_scores = cross_val_score(clf, train[features], train[target], cv=5)

    print(clf_name, ' mean accuracy: ', round(cv_scores.mean() * 100, 3),
          '% std: ', round(cv_scores.std() * 100, 3), '%')

final_model_smv_lin = SVC(kernel='linear',
                          probability=True).fit(train[features], train[target])
# final_model_gnb = gnb().fit(train[features], train[target])
Example #9
    predictions = []
    for classifier in f:
        scores = classifier.predict_proba(x)[:, 0]
        print(scores)
        predictions.append(scores)
    predictions = np.array(predictions).T
    predictions = [guess.tolist().index(max(guess)) for guess in predictions]
    pos = [int(predict == trueval) for predict, trueval in zip(predictions, y)]
    acc = sum(pos) / len(pos)
    print(acc)
    return


data = (np.loadtxt('smarthome.csv', delimiter=','))
x, y = split_data(data)
reg_stats = show_stats(x, y, gnb(), 10, 'Basic Gaussian Naive Bayes')

disc = discretize_interval(data)
x, y = split_data(disc)
disc_stats = show_stats(x, y, gnb(), 10, "Discretized Naive Bayes")

tval, pval = ttest_ind(reg_stats['test_score'], disc_stats['test_score'])
print("tval", tval, "pval", pval)

x_train = x[0:int(len(x) * 2 / 3)]
x_test = x[int(len(x) * 2 / 3):-1]
y_train = y[0:int(len(y) * 2 / 3)]
y_test = y[int(len(x) * 2 / 3):-1]

e = one_vs_all_train(x_train, y_train, gnb(), range(8))
one_vs_all_test(x_test, y_test, e)
Example #10
import json
import numpy as np
from sklearn.model_selection import KFold as kf, cross_val_score as cvs
from sklearn.naive_bayes import GaussianNB as gnb, BernoulliNB as bnb

with open('train.json') as data:
    train = json.load(data)

cuisine = []
ingredients = []
for i in train:
    cuisine.append(i["cuisine"])
    ingredients.extend(i["ingredients"])

singredients = list(set(ingredients))
traind = []
d = {singredients[i]: i for i in range(len(singredients))}
for i in train:
    row = [0] * len(singredients)
    for j in i["ingredients"]:
        row[d[j]] = 1
    traind.append(row)

k_fold = kf(n_splits=3)
ga = cvs(gnb(), traind, cuisine, cv=k_fold, n_jobs=-1)
ba = cvs(bnb(), traind, cuisine, cv=k_fold, n_jobs=-1)

f = open('2d', 'w')
s = "Gaussian accuracy is: " + str(np.mean(ga))
print(s)
f.write(s + "\n")
s = "Bernoulli accuracy is: " + str(np.mean(ba))
print(s)
f.write(s + "\n")
f.close()
Example #11
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
#from sklearn.svm import SVC as svc

clf1 = gnb()
#clf2 = lr()
clf3 = rfc()
clf4 = abc()
clf5 = gbc()
#clf6 = svc()
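
# The Task 4 note above mentions Pipelines for multi-stage operations such as PCA.
# A minimal sketch of that idea (the PCA component count is an arbitrary assumption):
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
clf_pipe = Pipeline([('pca', PCA(n_components=5)), ('nb', gnb())])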

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split
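
# A minimal sketch of the Task 5 tuning idea, assuming `features` and `labels` were
# built earlier in the script; the grid values below are purely illustrative:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
cv = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
grid = GridSearchCV(rfc(), param_grid={'n_estimators': [50, 100], 'min_samples_split': [2, 4]}, cv=cv)
grid.fit(features, labels)
clf = grid.best_estimator_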
Example #12
#
#####################################################

import pandas as pd

if __name__ == '__main__':

    train_features = pd.read_csv('train/train_features.csv')

    columns = train_features.columns[3:]

    train_labels = train_features["drop"].values
    train_features = train_features[columns].values

    from sklearn.naive_bayes import GaussianNB as gnb
    clf = gnb()
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split

    score = cross_val_score(clf, train_features, train_labels, cv=5, scoring='accuracy')
    print(score)
    #X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.4, random_state=1)


    #clf.fit(train_features, train_labels)
    #clf.fit(X_train, y_train)
    #print clf.score(X_test, y_test)
    #print "fit susscess"
    del train_features
    del train_labels
"""
Example #13
import numpy as np
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.metrics import accuracy_score
from time import time

X = np.array([[2, 5], [3, 6], [1, 7], [1, 2], [4, 3], [6, 8], [7, 3], [6, 1],
              [8, 7], [9, 3]])
Y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

startTime = time()
clf = gnb()
clf.fit(X, Y)
pred = clf.predict([[0, 1]])
print(pred)

testX = np.array([[1, 9], [3, 1], [4, 7], [6, 5], [5, 5], [7, 9]])
testY = np.array([1, 1, 1, 2, 2, 2])

pred = clf.predict(testX)
print("Accuracy ",
      accuracy_score(testY, pred) * 100)

print("Time ", round(time() - startTime, 3), "sec")
Example #14
        scores.append(sum(stats['test_score']) / len(stats['test_score']))
    for score in scores:
        print('monk', i, ':', score)
    print('-----------------------------')
    return sum(scores) / len(scores)


x1, y1 = read_data("monks-1.csv")
x2, y2 = read_data("monks-2.csv")
x3, y3 = read_data("monks-3.csv")

feats = [x1, x2, x3]
labs = [y1, y2, y3]

print('***Using 3-fold validation***')

worst = show_stats(feats, labs, pct(max_iter=100, tol=0), 3, 'perceptron')
best = show_stats(feats, labs, dt(max_depth=10), 3, 'decision tree')
show_stats(feats, labs, knn(n_neighbors=3), 3, 'K-nearest-neighbors')
show_stats(feats, labs, gnb(), 3, 'Gaussian Naive Bayes')

print('t test between perceptron and decision tree:', ttest_ind(worst, best))

print('***Using Leave-one-out***')

worst = show_stats(feats, labs, pct(max_iter=50, tol=0), loo(), 'perceptron')
best = show_stats(feats, labs, dt(max_depth=10), loo(), 'decision tree')
show_stats(feats, labs, knn(n_neighbors=3), loo(), 'K-nearest-neighbors')
show_stats(feats, labs, gnb(), loo(), 'Gaussian Naive Bayes')

print('t test between perceptron and decision tree:', ttest_ind(worst, best))
Example #15
def predictByGNB(features, classes, test):
    # no hyperparameters needed for GaussianNB
    clf = gnb()
    clf.fit(features, classes)
    return clf.predict(test)
Example #16
print("LABEL:")
print(labels.head())
feature_train, feature_test, label_train, label_test = train_test_split(featuers, labels, test_size=0.3, train_size=0.7)


# 1.Using KNN Classification
model = KNeighborsClassifier(5) # The knearest k=5
model.fit(feature_train, label_train)
predicts=model.predict(feature_test)
print("PREDICT RESULT Using KNN:",predicts)
accuracy = accuracy_score(label_test, predicts)
print('Accuracy of KNN classifier :',accuracy) # =82%


# 2.Using Naive Bayes Classification
model = gnb()
model.fit(feature_train, label_train)
predicts=model.predict(feature_test)
print("PREDICT RESULT Using Naive Bayes:",predicts)
accuracy = accuracy_score(label_test, predicts)
print('Accuracy of Naive Bayes classifier :',accuracy) # =82%

# 3.Using Decision Tree Induction Classification

# First we need to convert continuous values into categorical ones as much as we can
print(max(featuers.age), min(featuers.age))  # to know the bins range: 70, 32
ageCategory = pd.cut(featuers.age, bins=[0, 17, 32, 65, 100], labels=['child', 'teenager', 'adult', 'elderly'])
cigsPerDayCategory = pd.cut(featuers.cigsPerDay, bins=[-1, 2.0, 5.0, 7.0, 20.0], labels=['low', 'medium', 'high', 'veryHigh'])
featuers.insert(2, "ageCategory", ageCategory)
featuers.insert(4, "cigsPerDayCategory", cigsPerDayCategory)
# Now that the categorical columns have been added, we need to drop the continuous-value columns
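# A possible version of that drop, assuming 'age' and 'cigsPerDay' are the continuous
# columns being replaced here:
featuers = featuers.drop(['age', 'cigsPerDay'], axis=1)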
Example #17
def naive_bayes_speed_test(dftrain, dftrain_y, plotdir):
    atitle = 'Naive Bayes'
    afile = 'nbayes'   
    clf = gnb()
#   speed_test_medium(clf, dftrain, dftrain_y, atitle, afile, plotdir)
    speed_test_large(clf, dftrain, dftrain_y, atitle, afile, plotdir)
Example #18
]

target = ['defects']

# print(df)

# print(df.shape)
# print(train[features].shape)
# print(train[target].shape)
# print(train[target].values.ravel().shape)
# # print(test.shape)

# Y = train[target].values.reshape(train[target].shape[0])
# print(Y)

classifiers = [knnc(), dtc(), SVC(), SVC(kernel='linear'), gnb()]

classifier_names = [
    'K nearest neighbors', 'Decision Tree Classifier',
    'SVM classifier with RBF kernel', 'SVM classifier with linear kernel',
    'Gaussian Naive Bayes'
]

# for clf, clf_name in zip(classifiers, classifier_names):
#     cv_scores = cross_val_score(clf, train[features], train[target], cv=5)

#     print(clf_name, ' mean accuracy: ', round(cv_scores.mean()*100, 3), \
#     	'% std: ', round(cv_scores.var()*100, 3),'%')

# final_model_smv_lin = SVC(kernel='linear').fit(train[features], Y)
final_model_gnb = gnb().fit(train[features], train[target].values.ravel())
Example #19
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.metrics import accuracy_score
data = pd.read_csv('pima-indians-diabetes.csv')
print(data.describe())
features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Age',
    'Insulin', 'DiabetesPedigreeFunction'
]
target = 'Class'
train, test = train_test_split(data, test_size=0.2)
clf = gnb().fit(train[features], train[target])
y_predicted = clf.predict(test[features])
print "Accuracy ", round(accuracy_score(test[target], y_predicted) * 100,
                         2), " %"

#///////////////////////OUTPUT//////////////////////////////////////////////////////
# chanchald@chanchald-X553SA:~$ cd Desktop
# chanchald@chanchald-X553SA:~/Desktop$ cd p2
# chanchald@chanchald-X553SA:~/Desktop/p2$ python script.py
#        Pregnancies     Glucose  BloodPressure  SkinThickness         BMI  \
# count   768.000000  768.000000     768.000000     768.000000  768.000000
# mean      3.845052  120.894531      69.105469      20.536458   79.799479
# std       3.369578   31.972618      19.355807      15.952218  115.244002
# min       0.000000    0.000000       0.000000       0.000000    0.000000
# 25%       1.000000   99.000000      62.000000       0.000000    0.000000
# 50%       3.000000  117.000000      72.000000      23.000000   30.500000
# 75%       6.000000  140.250000      80.000000      32.000000  127.250000
# max      17.000000  199.000000     122.000000      99.000000  846.000000
Example #20
get_dummies('Parch')

#Replacing single missing fare with mean
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)

#Standard Scaling
#Scales the data so that it has mean 0 and variance 1
#Removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
train_data['Fare'] = sc.fit_transform(train_data[['Fare']])
test_data['Fare'] = sc.transform(test_data[['Fare']])  # reuse the scaler fitted on the training data

from sklearn.model_selection import train_test_split as tts

x = train_data.drop(['PassengerId','Survived'],axis=1)
y = train_data['Survived']

x_train,x_test,y_train,y_test = tts(x,y,test_size=0.2,random_state=0)

#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.metrics import accuracy_score as acc

gaussian = gnb()
gaussian.fit(x_train,y_train)
y_pred = gaussian.predict(x_test)
acc_gn = round(acc(y_pred,y_test)*100,2)
print("Accuracy Using Naive Bayes ",acc_gn)
Example #21
dataset2 = pd.concat([dataset2, Embarked_dum], axis=1)

dataset2 = dataset2.drop("Sex", axis=1)
dataset2 = dataset2.drop("Embarked", axis=1)

x = dataset2.iloc[:, [2, 4, 5, 6, 8, 10, 11, 12, 13]].values
y = dataset2.iloc[:, [1]].values
x = x.astype(float)

import statsmodels.api as st
x = np.append(arr=np.ones((889, 1)).astype(int), values=x, axis=1)
x_o = x[:, [0, 1, 2, 3, 6, 7, 8, 9]]
reg_OLS = st.OLS(endog=y, exog=x_o).fit()
reg_OLS.summary()

from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x_o, y, test_size=0.2, random_state=23)

from sklearn import preprocessing
mms = preprocessing.MinMaxScaler(feature_range=(0, 1))
x_train_mms = mms.fit_transform(x_train)
x_test_mms = mms.transform(x_test)  # reuse the scaler fitted on the training data

from sklearn.naive_bayes import GaussianNB as gnb
cf = gnb()
cf.fit(x_train_mms, y_train)
y_pre = cf.predict(x_test_mms)

from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pre))
Example #22
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest

selection = SelectKBest(k=10)
X_new = selection.fit(X_poly[:72325, :], Y).transform(X_poly)

X = X_new[:72325, :]
X_sub = X_new[72325:, :]

### Classifiers

KNC3 = KNeighborsClassifier(n_neighbors=5)
SVM = svm.SVC(probability=True)
GNB = gnb()
DT = DecisionTreeClassifier(criterion='gini', random_state=1)
GBC = GradientBoostingClassifier(n_estimators=8000, loss='deviance')
MPL = MLPClassifier(alpha=1e-5,
                    activation='relu',
                    random_state=1,
                    hidden_layer_sizes=(100, 100, 100, 100))
RFC = RandomForestClassifier(n_estimators=300,
                             criterion='gini',
                             random_state=1,
                             oob_score=True)
SGD = SGDClassifier()

agg = VotingClassifier(estimators=[('SVM', SVM), ('GNB', GNB), ('RFC', RFC)],
                       voting='soft',
                       weights=[1, 1, 1])
Example #23
    return grid.best_estimator_


# try logistic regression
lr = lrc(random_state=seed)
params = {"C": [10000, 15000]}
lr1 = simple_gridsearch(lr, params)

# try random forest
rf = RandomForestClassifier(random_state=seed)
params = {'n_estimators': [250, 500]}
rf1 = simple_gridsearch(rf, params)

# try gaussian bayes
t0 = time.time()
nb = gnb()
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train, y)
nb.fit(Xtrain, ytrain)
prediction = nb.predict(Xtest)
print(roc_auc_score(ytest, prediction))
print('timeused: ', time.time() - t0)

# try KNN
knn = KNeighborsClassifier()
params = {"n_neighbors": [5, 10, 15, 20]}
knn1 = simple_gridsearch(knn, params)

# try gradient boosting
gb = GradientBoostingClassifier(random_state=seed)
params = {"n_estimators": [100, 250, 500]}
gb1 = simple_gridsearch(gb, params)
Example #24
import numpy as np
import pandas as pd
import regression as reg
from sklearn.naive_bayes import GaussianNB as gnb
N = 20000

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

train_data = train_data.drop(['url'], axis=1) #remove 'url' information.
train_data = train_data.drop(['timedelta'], axis=1) #remove 'timedelta' information.
X = np.array(train_data.drop(['shares'], axis=1))
y = np.array(train_data['shares']) #This is the target

XTrain = X[:N,:] #use the first N samples for training
yTrain = y[:N]
XVal = X[N:,:] #use the rests for validation
yVal = y[N:]

# print type(XTrain) matrix
model = gnb()
model.fit(XTrain,yTrain)
training = model.predict(XTrain)
validation = model.predict(XVal)

print "NB"
print "Training error ", np.mean(np.abs(yTrain - training))
print "Validation error ", np.mean(np.abs(yVal - validation))
Xtest = test_data.values
result = model.predict(Xtest)
np.savetxt('result/resultNB.txt', result)
Example #25
from datahandle import handle_data
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.linear_model import LogisticRegression as lr
from sklearn.naive_bayes import GaussianNB as gnb

dat = handle_data('cleveland')
dat.read_data()
X_train, X_test, y_train, y_test = dat.partition()

models = [rfc(), dtc(), lr(), gnb()]


def pred_prob(model):
    model.fit(X_train, y_train)
    true = y_test
    pred = model.predict(X_test)
    prob = model.predict_proba(X_test)
    return true, pred, prob