Example #1
acc = model.score(X_test, y_test)  # mean accuracy on the given data and labels
print('Logistic regression model score:', acc)
# Parameter notes:
# penalty: regularization type to use (default: l2)
# dual: use False when n_samples > n_features (the default)
# C: inverse of regularization strength; smaller values mean stronger regularization
# n_jobs: number of parallel jobs
# random_state: seed for the random number generator
# fit_intercept: whether to fit an intercept (constant) term
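# A hedged sketch of the estimator these notes describe; the original snippet
# starts after the model was built, so the construction below is an assumption.
from sklearn import linear_model
model = linear_model.LogisticRegression(penalty='l2', dual=False, C=1.0,
                                        fit_intercept=True, n_jobs=None,
                                        random_state=0)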

# Naive Bayes
# "Bayesian classification" is the umbrella term for classifiers built on Bayes' theorem;
# naive Bayes is the simplest and most common member of that family.
model = sk_bayes.MultinomialNB(alpha=1.0, fit_prior=True,
                               class_prior=None)  # naive Bayes with a multinomial distribution
model = sk_bayes.BernoulliNB(alpha=1.0,
                             binarize=0.0,
                             fit_prior=True,
                             class_prior=None)  # naive Bayes with a Bernoulli distribution
model = sk_bayes.GaussianNB()  # naive Bayes with a Gaussian distribution
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)  # mean accuracy on the given data and labels
print('Naive Bayes (Gaussian) model score:', acc)
# Parameter notes:
# alpha: smoothing parameter
# fit_prior: whether to learn class prior probabilities; if False, a uniform prior is used
# class_prior: explicit class priors; if given, they are not adjusted from the data
# binarize: threshold for binarizing features; if None, the input is assumed to be binary vectors
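# Hedged toy illustration of binarize (not from the original script): feature
# values above the threshold count as 1, the rest as 0.
import numpy as np
toy_X = np.array([[0.2, 1.5], [0.0, 0.4], [2.0, 0.1]])
toy_y = [0, 1, 0]
toy_model = sk_bayes.BernoulliNB(alpha=1.0, binarize=0.5)  # features > 0.5 become 1
toy_model.fit(toy_X, toy_y)
print(toy_model.predict([[0.6, 0.0]]))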

# Decision tree
model = sk_tree.DecisionTreeClassifier(criterion='entropy',
                                       max_depth=None,
                                       min_samples_leaf=1,
Example #2
clf_RF, p = model_RF()
# model evaluation
acc_RF = model_evaluation(clf_RF, y_test)
print ('{0:10s} {1:.1f}'.format('AUC Score',auc_score(y_test, p[:,1])*100))

# Accuracy for all Models
accuracy_normal = [acc_LR, acc_NB, acc_RF, acc_DT]
accuracy_normal = ['{0:.2f}'.format(i * 100) for i in accuracy_normal]

###############################################################
# 4.5 Cross Validation for all models
###############################################################
# Logistic Regression
clf1 = LogisticRegression(tol=1e-8, penalty='l2', C=2)
# Naive Bayes
clf2 = nb.BernoulliNB(alpha=1.0, binarize=0.0)
# Decision Tree
clf3 = DecisionTreeClassifier(max_depth=100)
# Random Forest
clf4 = RandomForestClassifier(n_estimators=100)

models=[clf1,  clf2, clf3, clf4]

n_Folds = 10
# Accuracy after cross validation:
accuracy_cv = []
for clf in models:
    accuracy_common = 0
    for test_run in range(n_Folds):
        # (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=.2)
        # call classifier
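
# The inner loop above is cut off in this excerpt; a hedged sketch of one way to
# finish the cross-validation with scikit-learn's built-in scorer (X and y are
# the full feature matrix and labels, which this excerpt does not show):
from sklearn.model_selection import cross_val_score

for clf in models:
    scores = cross_val_score(clf, X, y, cv=n_Folds, scoring='accuracy')
    accuracy_cv.append('{0:.2f}'.format(scores.mean() * 100))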
Example #3
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
Example #4
def vocabvec(word, vocab):
    vector = [0] * len(vocab)
    for i in range(len(vocab)):
        if vocab[i] in word:
            vector[i] = 1
    return vector
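

# get_word and get_vocab are used below but not defined in this snippet; a
# minimal hedged sketch of what they plausibly do (tokenize each document, then
# collect the vocabulary across all documents):
def get_word(text):
    return text.lower().split()


def get_vocab(word_lists):
    vocab = set()
    for words in word_lists:
        vocab.update(words)
    return sorted(vocab)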


X = [
    'my dog has flea problems help please',
    'maybe not take him to dog park stupid',
    'my dalmation is so cute I love him',
    'stop posting stupid worthless garbage',
    'mr licks ate my steak how to stop him',
    'quit buying worthless dog food stupid'
]
train_data_Y = [0, 1, 0, 1, 0, 1]

words = [get_word(x) for x in X]
vocab = get_vocab(words)
train_data_X = [vocabvec(word, vocab) for word in words]

from sklearn import naive_bayes as nb
clf = nb.BernoulliNB()

clf.fit(train_data_X, train_data_Y)

mail = 'my dog stupid'
word = get_word(mail)
vector = vocabvec(word, vocab)
print(clf.predict([vector]))
Example #5
    def training(self, train_pct, output_path, output_scaler_path, model_name, properties):
        K.clear_session()
        self.graph = tf.get_default_graph()
        with self.graph.as_default():
            self.model_name = model_name
            df = pd.read_csv(self.fname)
            df = df.sample(frac=1)

            df_norm = df
    #        df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
    #        df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))

            new_df = df_norm.sample(frac=1, random_state=self.RSEED)
            
            x = new_df.drop('Class', axis=1)
            y = new_df['Class']
            x, tx, y, ty = train_test_split(x, y, train_size=train_pct, stratify=y)

            train_x = x.values
            train_y = y.values
            test_x = tx.values

            sc = StandardScaler()
            train_x = sc.fit_transform(train_x)
            pickle.dump(sc, open(output_scaler_path, "wb"))
            
            if model_name == ModelNames.RANDOM_FOREST:
                
                rf = RandomForestClassifier(**properties, random_state=self.RSEED)
                rf.fit(train_x, train_y)

                pickle.dump(rf, open(output_path, "wb"))

                self.model = rf
            
            elif model_name == ModelNames.LOGISTIC_REGRESSION:
                lr = LogisticRegression(**properties, random_state=self.RSEED)
                lr.fit(train_x, train_y)
                pickle.dump(lr, open(output_path, "wb"))
                self.model = lr

            elif model_name == ModelNames.ADAPTIVE_BOOST:
                ada = AdaBoostClassifier(**properties, random_state=self.RSEED)
                ada.fit(train_x, train_y)
                pickle.dump(ada, open(output_path, "wb"))
                self.model = ada

            elif model_name == ModelNames.NAIVE_BAYES:
                nb = naive_bayes.BernoulliNB(**properties)
                nb.fit(train_x, train_y)
                pickle.dump(nb, open(output_path, "wb"))
                self.model = nb

            elif model_name == ModelNames.SVM:
                clf = svm.SVC(random_state=self.RSEED, **properties)
                clf.fit(train_x, train_y)
                pickle.dump(clf, open(output_path, "wb"))
                self.model = clf

            elif model_name == ModelNames.OCSVM:
                ocsvm = svm.OneClassSVM(**properties)
                train_x = train_x[train_y == 0]
                ocsvm.fit(train_x)
                pickle.dump(ocsvm, open(output_path, "wb"))
                self.model = ocsvm

            elif model_name == ModelNames.AUTOENCODED_DEEP_LEARNING:

                input_dimension = train_x.shape[1]
                learning_rate = properties.get('learning_rate')
                encoding_dimension = properties.get('encoding_dimension')
                if learning_rate is None:
                    learning_rate = 1e-7
                else:
                    del properties['learning_rate']

                if encoding_dimension is None:
                    encoding_dimension = input_dimension
                else:
                    del properties['encoding_dimension']            

                input_layer = Input(shape=(input_dimension, ))
                    
                Encoder1 = Dense(encoding_dimension, activation="tanh", activity_regularizer=regularizers.l1(learning_rate))(input_layer)
                Encoder2 = Dense(int(encoding_dimension/2), activation="relu")(Encoder1)
                Encoder3 = Dense(int(encoding_dimension/4), activation="tanh")(Encoder2)
                Decoder1 = Dense(int(encoding_dimension/4), activation="relu")(Encoder3)
                Decoder2 = Dense(int(encoding_dimension/2), activation="tanh")(Decoder1)
                Decoder3 = Dense(input_dimension, activation="softmax")(Decoder2)

                AutoEncoderModel = Model(inputs=input_layer, outputs=Decoder3)
                AutoEncoderModel.compile(metrics=['accuracy'], loss='mean_squared_error', optimizer='adam')
                # AutoEncoderModel.compile(metrics=['accuracy'], loss=properties.get('loss'), optimizer=properties.get('optimizer'))

                cp = ModelCheckpoint(filepath=output_path, save_best_only=True)
                shuffle = True
                if self.RSEED is None:
                    shuffle = False
                history = AutoEncoderModel.fit(train_x, train_x,
                                            epochs=properties.get('epochs'),
                                            batch_size=properties.get('batch_size'),
                                            shuffle=shuffle,
                                            verbose=1,
                                            callbacks=[cp], 
                                            validation_data=(test_x, test_x)).history
                
                self.model = AutoEncoderModel
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, valid_y), confusion_matrix(
        valid_y, predictions)


# In[11]:

# Naive Bayes on Ngram Level TF IDF Vectors
NBAccuracy = train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_ngram,
                         train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", NBAccuracy)

# In[12]:

# SVM SVC on Ngram Level TF IDF Vectors
SVCAccuracy = train_model(SVC(kernel='linear'), xtrain_tfidf_ngram, train_y,
                          xvalid_tfidf_ngram)
print("SVC, N-gram TF-IDF: ", SVCAccuracy)

# In[13]:

# SVM NuSVC on Ngram Level TF IDF Vectors
NuSVCAccuracy = train_model(NuSVC(kernel='linear', probability=True),
                            xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
Example #7
import pickle
from sklearn import naive_bayes

with open('../DataSet-Release 1/ds2/ds2Train.csv', 'r') as myFile:
    ds2Train = [line.split(',') for line in myFile.read().split('\n')
                ]  # [1:] would drop a header row, but the training set has no header

ds2Train.pop()
#print (ds2Train[0])
featuresds2T = [d[:-1] for d in ds2Train]
featuresds2T = [[int(x) for x in row]
                for row in featuresds2T]  #Convert chars to int
labelsds2T = [d[-1] for d in ds2Train]
labelsds2T = [int(x) for x in labelsds2T]  #Convert chars to int

#Train using training set
nBclassifier = naive_bayes.BernoulliNB()
nBclassifier.fit(featuresds2T, labelsds2T)

#Export trained model
with open('naive_bayes_model_ds2.pkl', 'wb') as myFile:
    pickle.dump(nBclassifier, myFile)
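
# Hedged follow-up sketch: reload the exported model and classify one training
# row, just to confirm the pickle round-trips:
with open('naive_bayes_model_ds2.pkl', 'rb') as myFile:
    loaded_clf = pickle.load(myFile)
print(loaded_clf.predict([featuresds2T[0]]))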
Example #8
def create_naive_bayes():
    model = naive_bayes.BernoulliNB()
    return model
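
# A minimal hedged usage sketch with toy data (not part of the original module):
import numpy as np
from sklearn import naive_bayes

clf = create_naive_bayes()
clf.fit(np.array([[1, 0], [0, 1], [1, 1]]), [0, 1, 0])
print(clf.predict([[1, 0]]))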
Example #9
            'occu_ Machine-op-inspct', 'occu_ Other-service', 'occu_ Priv-house-serv', 'occu_ Prof-specialty', 'occu_ Protective-serv', 'occu_ Sales',
            'occu_ Tech-support', 'occu_ Transport-moving']
data = data.reindex(columns=reorder_colnames)
data = pd.get_dummies(data, columns=['race'])

features = ['age', 'fnlwgt', 'work_ Private','work_ Self-emp','work_ Government', 'edunum', 'marital', 'relation', 'sex', 'gain', 'loss', 'hpw', 'country',
            'occu_ Adm-clerical', 'occu_ Armed-Forces', 'occu_ Craft-repair', 'occu_ Exec-managerial', 'occu_ Farming-fishing', 'occu_ Handlers-cleaners',
            'occu_ Machine-op-inspct', 'occu_ Other-service', 'occu_ Priv-house-serv', 'occu_ Prof-specialty', 'occu_ Protective-serv', 'occu_ Sales',
            'occu_ Tech-support', 'occu_ Transport-moving', 'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White']
y = data['income']
X = data[features]
print('Using features: ' + str(features))

# Define the Naive Bayes models
gaussianModel = nb.GaussianNB()
bernoulliModel = nb.BernoulliNB()
multinomialModel = nb.MultinomialNB()
complementModel = nb.ComplementNB()

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Test options and evaluation metric
scoring = 'accuracy'

# Fit the training sets
gaussianModel.fit(X_train, y_train)
bernoulliModel.fit(X_train, y_train)
multinomialModel.fit(X_train, y_train)
complementModel.fit(X_train, y_train)
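
# Hedged follow-up: score each fitted model on the held-out split, using the
# 'accuracy' metric declared above:
for name, model in [('GaussianNB', gaussianModel), ('BernoulliNB', bernoulliModel),
                    ('MultinomialNB', multinomialModel), ('ComplementNB', complementModel)]:
    print(name, model.score(X_test, y_test))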
Example #10
def document_everything(X_train, X_test, y_train, y_test):
    Reports = {}
    Accuracies = {}
    conf_matrices = {}
    
    clf = svm.LinearSVC(C=10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['SVM'] = clf.score(X_test, y_test)
    Reports['SVM'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['SVM'] = metrics.confusion_matrix(y_test, y_pred)

    print('SVM Done')

    clf = ensemble.RandomForestClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['RF'] = clf.score(X_test, y_test)
    Reports['RF'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['RF'] = metrics.confusion_matrix(y_test, y_pred)

    print('RF Done')

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['DT'] = clf.score(X_test, y_test)
    Reports['DT'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['DT'] = metrics.confusion_matrix(y_test, y_pred)

    print('DT Done')
    
    clf = neighbors.KNeighborsClassifier(n_neighbors=1,leaf_size=10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['KNN'] = clf.score(X_test, y_test)
    Reports['KNN'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['KNN'] = metrics.confusion_matrix(y_test, y_pred)

    print('KNN Done')

    clf = naive_bayes.MultinomialNB()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['MNB'] = clf.score(X_test, y_test)
    Reports['MNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['MNB'] = metrics.confusion_matrix(y_test, y_pred)
    
    print('MNB Done')
    
    clf = naive_bayes.BernoulliNB(alpha=1e-10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['BNB'] = clf.score(X_test, y_test)
    Reports['BNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['BNB'] = metrics.confusion_matrix(y_test, y_pred)

    print('BNB Done')

    clf = naive_bayes.ComplementNB()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['CNB'] = clf.score(X_test, y_test)
    Reports['CNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['CNB'] = metrics.confusion_matrix(y_test, y_pred)

    print('CNB Done')
    
    clf = neural_network.MLPClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['MLP'] = clf.score(X_test, y_test)
    Reports['MLP'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['MLP'] = metrics.confusion_matrix(y_test, y_pred)

    print('MLP Done')

    for clf in Reports:
        print(str(clf))
        print(str(Accuracies[clf]))
        print(str(Reports[clf]))
        print(str(conf_matrices[clf]))
        size = [conf_matrices[clf][0][0],conf_matrices[clf][0][1],conf_matrices[clf][1][0],conf_matrices[clf][1][1]]
        labels = 'True Negatives', 'False Positives', 'False Negatives', 'True Positives'
        explode = (0,.1,0,.1)
        fig1, ax1 = plt.subplots()
        ax1.pie(size, explode=explode, labels = labels)
        ax1.axis('equal')
        plt.show()
Example #11
File: titanic.py  Project: bcohen1/Titanic
def main():
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    combine = [train_df, test_df]

    for df in combine:
        df.info()
        standardize_data(df)
        create_columns(df)
        create_bins(df)
        encode_data(df)
    # Define target (Y variable)
    target = ["Survived"]

    # Define features (X variables)
    train_df_x = [
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
        "FamilySize",
        "IsAlone",
        "Title",
    ]

    # Define numerical features (binned and encoded)
    train_df_x_bin = [
        "Pclass",
        "Sex_Code",
        "AgeBin_Code",
        "FareBin_Code",
        "Embarked_Code",
        "FamilySize",
        "IsAlone",
        "Title_Code",
    ]

    # Analyze feature correlation with target
    for x in train_df_x:
        if train_df[x].dtype != "float64":
            print(train_df[[x, target[0]]].groupby(x).mean())

    # Graph individual features by survival
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.histplot(x="Fare",
                 data=train_df,
                 hue="Survived",
                 multiple="stack",
                 ax=axis[0])
    sns.histplot(x="Age",
                 data=train_df,
                 hue="Survived",
                 multiple="stack",
                 ax=axis[1])
    sns.histplot(x="FamilySize",
                 data=train_df,
                 hue="Survived",
                 multiple="stack",
                 ax=axis[2])

    fig, axis = plt.subplots(2, 3, figsize=(16, 12))
    sns.barplot(x="Pclass", y="Survived", data=train_df, ax=axis[0, 0])
    sns.barplot(x="Sex", y="Survived", data=train_df, ax=axis[0, 1])
    sns.barplot(x="Embarked", y="Survived", data=train_df, ax=axis[0, 2])
    sns.barplot(x="IsAlone", y="Survived", data=train_df, ax=axis[1, 0])
    sns.barplot(x="Title", y="Survived", data=train_df, ax=axis[1, 1])

    # Compare class with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Sex", ax=axis[0])
    sns.barplot(x="Pclass",
                y="Survived",
                data=train_df,
                hue="IsAlone",
                ax=axis[1])
    sns.barplot(x="Pclass",
                y="Survived",
                data=train_df,
                hue="Embarked",
                ax=axis[2])

    # Compare Sex with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Pclass", ax=axis[0])
    sns.barplot(x="Sex",
                y="Survived",
                data=train_df,
                hue="IsAlone",
                ax=axis[1])
    sns.barplot(x="Sex",
                y="Survived",
                data=train_df,
                hue="Embarked",
                ax=axis[2])

    # Correlation heatmap of dataset
    fig, ax = plt.subplots(figsize=(14, 12))
    fig = sns.heatmap(
        train_df.corr(),
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        annot=True,
        ax=ax,
    )

    # Machine Learning Algorithm (MLA) selection and initialization
    mla = [
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(dual=False),
        neighbors.KNeighborsClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        naive_bayes.GaussianNB(),
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
    ]

    mla_compare = test_models(mla, train_df, train_df_x_bin, target)

    best_estimator = optimize_params(mla, mla_compare, train_df,
                                     train_df_x_bin, target)

    generate_submission_csv(test_df, train_df_x_bin, best_estimator)
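
# Hedged entry point, assuming this module is meant to be run directly:
if __name__ == "__main__":
    main()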
Example #12
color = pd.get_dummies(test['color'])
test_data = pd.concat([test, color[all_colors]], axis=1)

color = pd.get_dummies(train['color'])
train_data = pd.concat([train, color[all_colors]], axis=1)

train_data = train_data.drop('id', axis=1, inplace=False)
test_data = test_data.drop('id', axis=1, inplace=False)

# Assert that the training set's columns differ from the test set's only by 'type'
assert list(train_data.columns.difference(test_data.columns)) == ['type']
# MODELS

# Naive Bayes
nb = {'name': 'Bernoulli NaiveBayes'}
nb['model'] = naive_bayes.BernoulliNB()

# Logistic Regression
lr = {'name': 'Logistic Regression'}
lr['model'] = linear_model.LogisticRegression(solver='lbfgs')

# Logistic Regression with CV
lrcv = {'name': 'Cross-Validated Logistic Regression'}
lrcv['model'] = linear_model.LogisticRegressionCV(Cs=100,
                                                  solver='lbfgs',
                                                  n_jobs=-1)

# SVC
svc = {'name': 'Support Vector Machine'}
svc['model'] = OneVsRestClassifier(svm.SVC(kernel='rbf', probability=True))
Example #13
    X_test = pddtest.values[:, selection_feature_index[0:selection]]
    y_test = list(pddtest.iloc[:, 0])
    # #------------------------------------------------------------------------------------------------------
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # ## Model selection and classifier parameter choices ---------------------------------------------------------------------------------
    RF = RandomForestClassifier(n_estimators=320, criterion='gini', max_depth=8
                                , min_samples_split=12, min_samples_leaf=3, min_weight_fraction_leaf=0.06,
                                max_features='auto'
                                , max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                bootstrap=True
                                , oob_score=False, n_jobs=None, random_state=0, verbose=0, warm_start=False,
                                class_weight=None)
    knn = KNeighborsClassifier(n_neighbors=40, n_jobs=1)  # k-nearest neighbors
    naiveB = naive_bayes.BernoulliNB(alpha=1.6, binarize=1.41, fit_prior=True, class_prior=None)  # 0.575

    svm = SVC(C=100, kernel='rbf', gamma=0.01, probability=True)
    LR = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=4.0
                            , fit_intercept=True, intercept_scaling=2, class_weight=None
                            , random_state=None, solver='liblinear', max_iter=100, multi_class='ovr'
                            , verbose=0, warm_start=False, n_jobs=1)
    eclf = VotingClassifier(estimators=[('RF', RF), ('knn', knn), ('naiveB', naiveB), ('svm', svm), ('LR', LR)],
                            voting='soft', weights=[1, 1, 1, 1, 1])
    # # # Train the models ---------------------------------------------------------------------------------------------
    RF.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    naiveB.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    eclf.fit(X_train, y_train)
    LR.fit(X_train, y_train)
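
    # Hedged follow-up: compare each base model and the soft-voting ensemble on
    # the scaled test split prepared above:
    for name, model in [('RF', RF), ('kNN', knn), ('NaiveBayes', naiveB),
                        ('SVM', svm), ('LR', LR), ('Voting', eclf)]:
        print(name, model.score(X_test, y_test))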
Example #14
def runBer(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.BernoulliNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model
    vectorizer_binary = CountVectorizer(stop_words=stopWords,
                                        min_df=MIN_DF,
                                        binary=True,
                                        ngram_range=NGRAM_RANGE)

    #vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=(1, 2))
    #vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=(1, 3))
    vectorizer_tfidf = TfidfVectorizer(stop_words=stopWords,
                                       min_df=MIN_DF,
                                       ngram_range=NGRAM_RANGE)
    #vectorizer = vectorizer_tfidf
    vectorizer = vectorizer_binary
    print(vectorizer)

    #clf = linear_model.LogisticRegression(penalty='l2', C=1.2)
    _ = linear_model.LogisticRegression()
    _ = svm.LinearSVC()
    _ = naive_bayes.BernoulliNB(
    )  # useful for binary inputs (MultinomialNB is useful for counts)
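    # Hedged toy illustration of that distinction (not part of the original
    # script): MultinomialNB models raw term counts, while BernoulliNB models the
    # binarized presence/absence of each term.
    import numpy as np
    toy_counts = np.array([[3, 0, 1], [0, 2, 0], [1, 1, 4]])
    toy_labels = [0, 1, 0]
    naive_bayes.MultinomialNB().fit(toy_counts, toy_labels)
    naive_bayes.BernoulliNB(binarize=0.5).fit(toy_counts, toy_labels)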
    _ = naive_bayes.GaussianNB()
    _ = naive_bayes.MultinomialNB()
    _ = ensemble.AdaBoostClassifier(n_estimators=100,
                                    base_estimator=tree.DecisionTreeClassifier(
                                        max_depth=2, criterion='entropy'))
    #clf = ensemble.AdaBoostClassifier(n_estimators=100, base_estimator=tree.DecisionTreeClassifier(max_depth=2))
    _ = tree.DecisionTreeClassifier(max_depth=50, min_samples_leaf=5)
    #clf = tree.DecisionTreeClassifier(max_depth=2, min_samples_leaf=5, criterion='entropy')
    #clf = ensemble.RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_estimators=10, oob_score=False, n_jobs=-1, criterion='entropy')
    _ = ensemble.RandomForestClassifier(max_depth=10,
                                        min_samples_leaf=5,
                                        n_estimators=50,
                                        n_jobs=-1,
                                        criterion='entropy')
    #clf = ensemble.RandomForestClassifier(max_depth=30, min_samples_leaf=5, n_estimators=100, oob_score=True, n_jobs=-1)
Example #16
print("----------K-NEAREST NEIGHBOR-------------------")
# Build a sequence of models for k = 2, 4, 6, 8, ..., 20.
ks = [16]
for k in ks:
    # Create model and fit.
    mod_knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    mod_knn.fit(x_train, y_train)

    # Make predictions - both class labels and predicted probabilities.
    predsknn = mod_knn.predict(x_test)
    print(' EVALUATING MODEL: k = ' + str(k))
    # Look at results.
    print_binary_classif_error_report(y_test, predsknn)

print("\n----------Bernoulli - NAIVE BAYESIAN-------------------")
bnb_mod = naive_bayes.BernoulliNB()
bnb_mod.fit(x_train, y_train)
predsbnb = bnb_mod.predict(x_test)
print_binary_classif_error_report(y_test, predsbnb)

print("\n----------DECISION TREE-------------------")
print("DTREE WITH GINI IMPURITY CRITERION:")
dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
dtree_gini_mod.fit(x_train, y_train)
preds_gini = dtree_gini_mod.predict(x_test)
print_binary_classif_error_report(y_test, preds_gini)

print("\n----------SUPPORT VECTOR MACHINE-------------------")
# Build a sequence of models for different n_est and depth values.
cs = [2.0]
for c in cs:

# Naive Bayes
print("Naive Bayes với tfidf")
train_model(naive_bayes.MultinomialNB(),
            X_data_tfidf,
            y_data_n,
            X_test_tfidf,
            y_test_n,
            is_neuralnet=False)
########### Naive Bayes with tfidf results:
# Validation accuracy:  0.7211404728789986
# Test accuracy:  0.7024677045379265

print("BernoulliNB với tfidf")
train_model(naive_bayes.BernoulliNB(),
            X_data_tfidf,
            y_data_n,
            X_test_tfidf,
            y_test_n,
            is_neuralnet=False)
########### BernoulliNB with tfidf results
# Validation accuracy:  0.760778859527121
# Test accuracy:  0.7232527326929447

print("BernoulliNB với tfidf SVD")
train_model(naive_bayes.BernoulliNB(),
            X_data_tfidf_svd,
            y_data_n,
            X_test_tfidf_svd,
            y_test_n,
Example #18
data.extend(ham_processed_texts)
data.extend(spam_processed_texts)
textMatrix = transformTextToSparseMatrix(data)
textMatrix.head()
features = pd.DataFrame(textMatrix.apply(sum,axis=0))
extractedfeatures = [features.index[i] for i in range(features.shape[0]) if features.iloc[i,0]>5]
textMatrix = textMatrix[extractedfeatures]
labels = []
labels.extend(ones(5000))
labels.extend(zeros(5001))
# split into a train and a validation data set
train,test,trainlabel,testlabel = train_test_split(textMatrix,labels,test_size=0.1)

# bayes
clf = bayes.BernoulliNB(alpha=1,binarize=True)
model = clf.fit(train,trainlabel)
# SVM
model2 = LinearSVC()
model2.fit(train,trainlabel)

print(model.score(test,testlabel))
print(model2.score(test,testlabel))

# Output:
# 0.922077922077922
# 0.987012987012987
import matplotlib.pyplot as plt
from pylab import *  # so matplotlib can render Chinese characters
mpl.rcParams['font.sans-serif'] = ['SimHei']

names = ['1000', '2000', '4000', '6000', '8000','9000']
Example #19
test_data = pd.concat([hour, days, year, block, district, xs, ys], axis=1)

del days, district, hour, month, year, block, xs, ys, crime

# MODELS =======================================================================================
"""
Choose the model you want, fit it with the data,
then you can perform some bagging or
some boosting on the base model you have. 
"""
# Build up the features
features = list(train_data.columns[:-1])

# Base-Model construction
begin = time.time()
baseModel = naive_bayes.BernoulliNB()
baseModel.fit(train_data[features], train_data['Crime'])
showExecTime(begin, "BernoulliNB fitted.")

# Logistic Regression
begin = time.time()
lr = linear_model.LogisticRegression(solver='lbfgs')
lr.fit(train_data[features], train_data['Crime'])
showExecTime(begin, "LogisticRegression fitted.")

# Random Forest
begin = time.time()
randomForest = ensemble.RandomForestClassifier(n_estimators=10,
                                               n_jobs=-1,
                                               min_samples_leaf=50)
randomForest.fit(train_data[features], train_data['Crime'])
data.extend(ham_mails)
data.extend(spam_mails)

textMatrix = transformTextToSparseMatrix(data)
# textMatrix.head()  # preview the sparse matrix

features = pd.DataFrame(textMatrix.apply(sum, axis=0))
print(features)  # show the total frequency of each word
# keep only words that appear in at least 5 emails as features, to reduce the feature dimension
extractedfeatures = [
    features.index[i] for i in range(features.shape[0])
    if features.iloc[i, 0] > 5
]  # the result is a list of feature indices
textMatrix = textMatrix[extractedfeatures]
labels = []
labels.extend(ones(len(ham_mails)))
labels.extend(zeros(len(spam_mails)))
# split into training and test sets
train, test, trainlabel, testlabel = train_test_split(textMatrix,
                                                      labels,
                                                      test_size=0.3)

BYM_lb = bayes.BernoulliNB(alpha=1, binarize=True)  # scikit-learn's Bernoulli naive Bayes classifier
model = BYM_lb.fit(train, trainlabel)  # fit the model with the library call

print(train.shape)
print(len(trainlabel))
print(trainlabel[:int(sqrt(len(trainlabel)))])
print("识别准确率=", model.score(test, testlabel))
Example #21
# score_test = dt.score(test_col, test_ans)
# print('testing acc:',score_test)

#naive bayes
par_smooth = 0.003
# for i in range(10):

NB1 = naive_bayes.GaussianNB(var_smoothing=par_smooth)
NB1.fit(x_train, y_train)
score_train = NB1.score(x_train, y_train)
print('Gaussian NB training acc:', score_train)
# score_test = NB1.score(test_col, test_ans)
# print('Gaussian NB testing acc:',score_test)
# par_smooth=par_smooth+0.001

NB2 = naive_bayes.BernoulliNB(alpha=0.1, fit_prior=False)
NB2.fit(x_train, y_train)
score_train = NB2.score(x_train, y_train)
print('Bernoulli NB training acc:', score_train)
# score_test = NB2.score(test_col, test_ans)
# print('Bernoulli NB testing acc:',score_test)

NB3 = naive_bayes.MultinomialNB(alpha=0.1, fit_prior=False)
NB3.fit(x_train, y_train)
score_train = NB3.score(x_train, y_train)
print('Multinomial NB training acc:', score_train)
# score_test = NB3.score(test_col, test_ans)
# print('Multinomial NB testing acc:',score_test)
multi_pred = NB3.predict(test_col)

if sys.argv[1] == 'D':
def retornaModeloTreinado(model, validacao):
    if validacao == 'aprovado':
        X = train_data_aprovado
        y = target_data_aprovado
    elif validacao == 'evasao':
        X = train_data_evasao
        y = target_data_evasao

    return model.fit(X, y)


# models chosen for the pass/fail (aprovado/reprovado) task
logistic_regression_aprovado = LogisticRegression(penalty='l1')
svm_aprovado = svm.SVC()
knn_aprovado = neighbors.KNeighborsClassifier(weights='distance')
naive_bayes_aprovado = naive_bayes.BernoulliNB()

# models chosen for the dropout (evasao) task
logistic_regression_evasao = LogisticRegression(solver='newton-cg')
svm_evasao = svm.SVC()
knn_evasao = neighbors.KNeighborsRegressor(n_neighbors=7)
naive_bayes_evasao = naive_bayes.BernoulliNB()

# saving the trained models

joblib.dump(retornaModeloTreinado(knn_aprovado, 'aprovado'),
            'model/model_si_aprovado_knn.pkl')
joblib.dump(retornaModeloTreinado(knn_evasao, 'evasao'),
            'model/model_si_evasao_knn.pkl')
print('ok')
Example #23
    body_title_corpus.append(np.concatenate((corpus_data_features_nd[i], corpus_data_features_nd1[i])))

body_title_corpus = array(body_title_corpus)

for i in range(len(corpus_data_features_nd)):
    body_title_tags_corpus.append(np.concatenate((body_title_corpus[i], corpus_data_features_nd2[i])))

body_title_tags_corpus = array(body_title_tags_corpus)

print(body_title_tags_corpus)

X_train, X_test, y_train, y_test  = train_test_split(body_title_tags_corpus[0:len(train_data_df)], train_data_df.Popularity, random_state=2) 

print(body_title_tags_corpus.shape)

nb_model = naive_bayes.BernoulliNB()
#nb_model = naive_bayes.MultinomialNB()
#nb_model = naive_bayes.GaussianNB()

nb_model = nb_model.fit(X=X_train, y=y_train)

y_pred = nb_model.predict(X_test)

# get predictions

accu_score = cross_val_score(nb_model,X_test,y_pred,cv=10,scoring='accuracy').mean()
print("\n")
print("accuracy score : ", accu_score)

precision_score = cross_val_score(nb_model,X_test,y_pred,cv=10,scoring='precision').mean()
#print "\n"
Example #24
def bayes_train(type, ratio):
    print("begin training bayes...")
    with open(dump_file) as f:
        x = cPickle.load(f)
        y = cPickle.load(f)
        idx = cPickle.load(f)
        x = x.tocsr()
        y = np.array(y.todense()).ravel()
        idx = np.array(idx.todense()).ravel()
    print("load data complete")
    x = preprocessing.normalize(x)

    x, y, idx = shuffle(x, y, idx, random_state=42)

    if type == "m":
        clf = naive_bayes.MultinomialNB()
    else:
        clf = naive_bayes.BernoulliNB()

    train_len = int(0.8 * ratio * y.shape[0])
    test_len = ratio * y.shape[0] - train_len
    train_x = x[:train_len]
    test_x = x[train_len:ratio * y.shape[0]]
    train_y = y[:train_len]
    test_y = y[train_len:ratio * y.shape[0]]
    test_idx = idx[train_len:ratio * y.shape[0]]

    clf.fit(train_x, train_y)
    pred_y = clf.predict(test_x)

    acc = accuracy_score(test_y, pred_y)
    score = np.empty([3, len(tags)], dtype=float)
    score[0], score[1], score[2], support = precision_recall_fscore_support(
        test_y, pred_y)
    macro = np.mean(score, axis=1)
    micro = np.mean(score * support, axis=1) / np.mean(support)

    f = open("error_log.txt", "w")
    for i in range(test_y.shape[0]):
        if test_y[i] == pred_y[i]:
            continue
        sql = "select url, comment from car.info where id = %d" % test_idx[i]
        cur.execute(sql)
        url, comment = cur.fetchone()
        info = "\npredict: %s\ntrue: %s\n%s\n%s\n" % (
            tags[pred_y[i]], tags[test_y[i]], url, comment)
        f.write(info)
    f.close()
    # with open("result.txt", "a") as f:
    # 	info = "\nBayes with %s and num %d: \n" % (type, train_len)
    # 	print info
    # 	f.write(info)
    # 	f.write(str(acc))
    print(acc)
    print(score[0])
    print(macro[0], micro[0])
    print(score[1])
    print(macro[1], micro[1])
    print(score[2])
    print(macro[2], micro[2])
    print(support)
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # Generalized Linear Models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # Support Vector Machine
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
Example #26
def main_3():
    x_data = datasets.load_boston().data
    naive_bayes.BernoulliNB()
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    #gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.LogisticRegression(C=100, random_state=0, solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(),
    #naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    #Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    #Discriminant Analysis
Example #28
    ensemble.ExtraTreesClassifier(criterion= 'entropy', max_depth= 6, n_estimators= 100, random_state= 0),
    ensemble.GradientBoostingClassifier(learning_rate= 0.05, max_depth= 2, n_estimators= 300, random_state= 0),
    ensemble.RandomForestClassifier(criterion= 'entropy', max_depth= 6, n_estimators= 100, oob_score= True, random_state= 0),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(max_iter_predict= 10, random_state= 0),

    #GLM
    linear_model.LogisticRegressionCV(fit_intercept= True, random_state= 0, solver= 'liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(alpha= 0.1),
    naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(algorithm= 'brute', n_neighbors= 7, weights= 'uniform'),

    #SVM
    svm.SVC(C= 2, decision_function_shape= 'ovo', gamma= 0.1, probability= True, random_state= 0),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    #Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    #Discriminant Analysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

kbestfilter = SelectKBest(chi2,k=500)

train_features = kbestfilter.fit_transform(dataset_small.get_train_features(),
                     dataset_small.get_train_labels())
test_features = kbestfilter.transform(dataset_small.get_test_features())                     


##
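# For 0/1 (Bernoulli) features the variance is p*(1-p); with p = 0.8 the filter
# below drops features that take the same value in more than 80% of the samples.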
threshold = 0.8*(1-0.8)
sel_var = VarianceThreshold(threshold = threshold)
sel_var.fit(np.sign(dataset_small.get_train_features()))

train_selected_features = sel_var.transform(dataset_small.get_train_features())
test_selected_features = sel_var.transform(dataset_small.get_test_features())

## train naive bayes
import sklearn.naive_bayes as naive_bayes
bnb = naive_bayes.BernoulliNB()

spam_filter = bnb.fit(np.sign(train_selected_features), 
                      dataset_small.get_train_labels())
spam_pred   = spam_filter.predict(test_selected_features)

## evaluate goodness of prediction
import sklearn.metrics
report = sklearn.metrics.classification_report(dataset_small.get_test_labels(),
                                     spam_pred)
Example #30
def autoTuning(X, y):
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.6,
                                            random_state=0)

    #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    vote_est = [
        #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostClassifier()),
        ('bc', ensemble.BaggingClassifier()),
        ('etc', ensemble.ExtraTreesClassifier()),
        ('gbc', ensemble.GradientBoostingClassifier()),
        ('rfc', ensemble.RandomForestClassifier()),

        #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
        ('gpc', gaussian_process.GaussianProcessClassifier()),

        #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        ('lr', linear_model.LogisticRegressionCV()),

        #Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
        ('bnb', naive_bayes.BernoulliNB()),
        ('gnb', naive_bayes.GaussianNB()),

        #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
        ('knn', neighbors.KNeighborsClassifier()),

        #SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVC(probability=True)),

        #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBClassifier())
    ]

    #Hard Vote or majority rules
    vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    vote_hard_cv = model_selection.cross_validate(vote_hard, X, y, cv=cv_split)
    vote_hard.fit(X, y)

    print("Hard Voting Training w/bin score mean: {:.2f}".format(
        vote_hard_cv['train_score'].mean() * 100))
    print("Hard Voting Test w/bin score mean: {:.2f}".format(
        vote_hard_cv['test_score'].mean() * 100))
    print("Hard Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    #Soft Vote or weighted probabilities
    vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    vote_soft_cv = model_selection.cross_validate(vote_soft, X, y, cv=cv_split)
    vote_soft.fit(X, y)

    print("Soft Voting Training w/bin score mean: {:.2f}".format(
        vote_soft_cv['train_score'].mean() * 100))
    print("Soft Voting Test w/bin score mean: {:.2f}".format(
        vote_soft_cv['test_score'].mean() * 100))
    print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    #Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    grid_n_estimator = range(10, 300, 50)
    grid_ratio = [.1, .25, .5, .75, 1.0]
    grid_learn = [.01, .03, .05, .75, .1, .15, .25]
    grid_max_depth = [2, 3, 4, 5, 6, 7, None]
    grid_min_samples = [5, 10, .03, .05, .10]
    grid_criterion = ['gini', 'entropy']
    grid_bool = [True, False]
    grid_seed = [0]

    grid_param = [
        [{
            #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'n_estimators': grid_n_estimator,  #default=50
            'learning_rate': grid_learn,  #default=1
            #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R
            'random_state': grid_seed
        }],
        [{
            #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'max_samples': grid_ratio,  #default=1.0
            'random_state': grid_seed
        }],
        [{
            #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'criterion': grid_criterion,  #default=”gini”
            'max_depth': grid_max_depth,  #default=None
            'random_state': grid_seed
        }],
        [{
            #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
            #'loss': ['deviance', 'exponential'], #default=’deviance’
            'learning_rate': [
                .05
            ],  #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            'n_estimators': [
                300
            ],  #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
            'max_depth': grid_max_depth,  #default=3   
            'random_state': grid_seed
        }],
        [{
            #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'criterion': grid_criterion,  #default=”gini”
            'max_depth': grid_max_depth,  #default=None
            'oob_score': [
                True
            ],  #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
            'random_state': grid_seed
        }],
        [{
            #GaussianProcessClassifier
            'max_iter_predict': grid_n_estimator,  #default: 100
            'random_state': grid_seed
        }],
        [{
            #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
            'fit_intercept': grid_bool,  #default: True
            #'penalty': ['l1','l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                       'saga'],  #default: lbfgs
            'random_state': grid_seed
        }],
        [{
            #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
            'alpha': grid_ratio,  #default: 1.0
        }],

        #GaussianNB -
        [{}],
        [{
            #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
            'n_neighbors': [6, 7, 8, 9, 10, 11, 12, 14, 16, 18, 20,
                            22],  #default: 5
            'weights': ['uniform', 'distance'],  #default = ‘uniform’
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': list(range(1, 50, 5))
        }],
        [{
            #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
            #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
            #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [1, 2, 3, 4, 5],  #default=1.0
            'gamma': grid_ratio,  #default: auto
            'decision_function_shape': ['ovo', 'ovr'],  #default:ovr
            'probability': [True],
            'random_state': grid_seed
        }],
        [{
            #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
            'learning_rate': grid_learn,  #default: .3
            'max_depth': [1, 2, 4, 6, 8, 10],  #default 2
            'n_estimators': grid_n_estimator,
            'seed': grid_seed
        }]
    ]

    start_total = time.perf_counter(
    )  #https://docs.python.org/3/library/time.html#time.perf_counter
    for clf, param in zip(
            vote_est,
            grid_param):  #https://docs.python.org/3/library/functions.html#zip

        #print(clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm
        #print(param)

        start = time.perf_counter()
        best_search = model_selection.GridSearchCV(estimator=clf[1],
                                                   param_grid=param,
                                                   cv=cv_split,
                                                   scoring='roc_auc',
                                                   n_jobs=-1)
        best_search.fit(X, y)
        run = time.perf_counter() - start

        best_param = best_search.best_params_
        print(
            'The best {} parameter for {} is {} with a runtime of {:.2f} seconds.'
            .format(best_search.best_score_, clf[1].__class__.__name__,
                    best_param, run))
        clf[1].set_params(**best_param)

    run_total = time.perf_counter() - start_total
    print('Total optimization time was {:.2f} minutes.'.format(run_total / 60))

    print('-' * 10)

    #%% [markdown]
    # # Submission

    #%%
    #Hard Vote or majority rules w/Tuned Hyperparameters
    grid_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    grid_hard_cv = model_selection.cross_validate(grid_hard, X, y, cv=cv_split)
    grid_hard.fit(X, y)

    print(
        "Hard Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}"
        .format(grid_hard_cv['train_score'].mean() * 100))
    print("Hard Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}".
          format(grid_hard_cv['test_score'].mean() * 100))
    print(
        "Hard Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}"
        .format(grid_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    #Soft Vote or weighted probabilities w/Tuned Hyperparameters
    grid_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    grid_soft_cv = model_selection.cross_validate(grid_soft, X, y, cv=cv_split)
    grid_soft.fit(X, y)

    print(
        "Soft Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}"
        .format(grid_soft_cv['train_score'].mean() * 100))
    print("Soft Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}".
          format(grid_soft_cv['test_score'].mean() * 100))
    print(
        "Soft Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}"
        .format(grid_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
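
# Hedged usage sketch (hypothetical names: X would be the prepared feature
# matrix and y the binary target used throughout this notebook):
# autoTuning(X, y)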