예제 #1
0
def NB_experiment(data_fold, train, test, dumper):
    """Cross-validate a ``GNB`` model on the training data, then rank the test set.

    Args:
        data_fold: passed to ``cross_val_score`` as ``cv``.
        train: indexable pair; ``train[0]`` holds the features and
            ``train[1]`` the labels.
        test: handed unchanged to ``clf_test`` together with the fitted model.
        dumper: object with a ``write`` method that receives the text report.

    Returns:
        None. Results are printed, written to ``dumper`` and saved to the
        file ``nb.ranking`` in the current working directory.
    """
    # Single-argument print calls behave identically on Python 2 and 3.
    print("Ready to find the Best Parameters for Naive Bayes")

    print('Gaussian Naive Bayes')
    nb = GNB()
    print("fitting NaiveBayes Experiment")

    dumper.write('Classifier: Naive Bayes\n')
    scores = cross_validation.cross_val_score(nb,
                                              train[0],
                                              train[1],
                                              cv=data_fold,
                                              score_func=accus)

    # The reported spread is half the standard deviation of the fold scores.
    reports = "Accuracy on Train: %0.2f (+/- %0.2f)" % (scores.mean(),
                                                        scores.std() / 2)
    print(reports)

    dumper.write(reports + '\n')
    # Second report line: the individual fold scores, space separated.
    reports = " ".join(['%0.2f' % (item) for item in scores])
    dumper.write(reports + '\n')

    # Refit a fresh model on the full training set before scoring the test set.
    nb = GNB()
    nb.fit(train[0], train[1])

    pred = clf_test(nb, test)
    # Close the ranking file deterministically instead of leaking the handle.
    with codecs.open('nb.ranking', 'w', 'utf-8') as ranking_file:
        output_ranking(pred, ranking_file)
    return None
예제 #2
0
 def get_new_model(self):
     """Build a fresh estimator for ``self.model_type`` and store it in ``self.model``.

     The estimator class is looked up by its ``model_type`` name, imported
     lazily, and constructed with ``self.model_args`` as keyword arguments.

     If ``self.model_type`` is not one of the supported names, ``self.model``
     is left untouched and the method returns without raising.
     """
     import importlib

     # model_type -> (module that defines the estimator, class name).
     # One flat table replaces the former regressor/classifier split, whose
     # ``split("_")`` test never matched these hyphenated names.
     estimator_locations = {
         "Linear-Regressor": ("sklearn.linear_model", "LinearRegression"),
         "Support-Vector-Regressor": ("sklearn.svm", "SVR"),
         "Decision-Tree-Regressor": ("sklearn.tree", "DecisionTreeRegressor"),
         "Random-Forest-Regressor": ("sklearn.ensemble",
                                     "RandomForestRegressor"),
         "Logistic-Regression-Classifier": ("sklearn.linear_model",
                                            "LogisticRegression"),
         "KNN-Classifier": ("sklearn.neighbors", "KNeighborsClassifier"),
         "Support-Vector-Classifier": ("sklearn.svm", "SVC"),
         "Naive-Bayes-Classifier": ("sklearn.naive_bayes", "GaussianNB"),
         "Decision-Tree-Classifier": ("sklearn.tree",
                                      "DecisionTreeClassifier"),
         "Random-Forest-Classifier": ("sklearn.ensemble",
                                      "RandomForestClassifier"),
     }

     location = estimator_locations.get(self.model_type)
     if location is None:
         # Unknown type: keep the previous behaviour of doing nothing.
         return

     module_name, class_name = location
     # Import only the module that is actually needed.
     estimator_cls = getattr(importlib.import_module(module_name), class_name)
     self.model = estimator_cls(**self.model_args)
예제 #3
0
def gbn_word2vec():
    """Train ``GNB`` on averaged word2vec features and preview a submission.

    Loads the data via ``get_data``, loads the saved word2vec model from
    ``../data/word2vec-nlp``, turns both splits into 300-dimensional averaged
    feature vectors, fits the classifier, prints the mean 10-fold ROC-AUC and
    prints the first ten rows of the prediction frame. Returns None.
    """
    classifier = GNB()
    train_data, test_data, label, _unused_train, test = get_data()

    model_dir = "../data/word2vec-nlp"
    model_file = "/".join((model_dir, "300features_40minwords_10context"))
    w2v = Word2Vec.load(model_file)

    train_vecs = word2vec_model.get_avg_feature_vecs(train_data, w2v, 300)
    test_vecs = word2vec_model.get_avg_feature_vecs(test_data, w2v, 300)
    classifier.fit(train_vecs, label)

    fold_scores = cross_val_score(classifier,
                                  train_vecs,
                                  label,
                                  cv=10,
                                  scoring='roc_auc')
    print("高斯贝叶斯分类器10折交叉验证得分: ", np.mean(fold_scores))

    print('保存结果...')
    predictions = classifier.predict(test_vecs)
    submission_df = pd.DataFrame(data={
        'id': test['id'],
        'sentiment': predictions
    })
    print(submission_df.head(10))
 def NLMmodelexp1():
     """Run ``modelExperiment`` on the NLM data with seven default classifiers.

     The classifier instances and their display names are passed as two
     parallel lists; results go to ``NLMmodelExperiment1.csv`` and
     ``NLMclassifier_plot1.png`` as named in the call.
     """
     classifiers = [LR(), DT(), KNC(), RF(), ABC(), GNB(), QDA()]
     display_names = [
         'LogisticRegression',
         'DTree',
         'KNN',
         'RandomForest',
         'AdaBoosted',
         'GaussianNB',
         'QuadraticDiscriminantAnalysis',
     ]
     modelExperiment(nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
                     classifiers, display_names, 'NLMmodelExperiment1.csv',
                     'NLMclassifier_plot1.png', True)
    def train(self):
        """Fit the classifier selected by ``self.model`` and score it on the training set.

        Supported ``self.model`` keys are 'GNB', 'BNB', 'MNB', 'LR', 'SVM' and
        'R'. 'LR' and 'SVM' are wrapped in ``GS`` with 5-fold CV over a fixed
        grid of ``C`` values. For 'SVM', ``self.penalty`` is overwritten with
        'l2' when it is neither 'l1' nor 'l2'. Any other key is logged and the
        process exits with status 0.

        Returns:
            The value of ``evaluator.accuracy_score`` for the fitted
            classifier's predictions on the training data.
        """
        logging.info('-' * 20)
        logging.info('Start training the %s model', self.model)
        train_data = self.feature_extractor.extract_feature(
            self.data_loader.get_trainset())

        # Grid of C values searched for both the LR and the SVM model.
        c_grid = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}

        if self.model == 'GNB':
            # Gaussian naive bayes
            self.classifier = GNB()
        elif self.model == 'BNB':
            # Bernoulli naive bayes
            self.classifier = BNB()
        elif self.model == 'MNB':
            # Multinomial naive bayes
            self.classifier = MNB()
        elif self.model == 'LR':
            # Logistic regression
            self.classifier = GS(cv=5,
                                 estimator=LR(penalty=self.penalty,
                                              max_iter=self.epoch,
                                              solver='liblinear'),
                                 param_grid=c_grid)
        elif self.model == 'SVM':
            # Support vector machine; only 'l1'/'l2' penalties are accepted,
            # anything else falls back to 'l2'.
            if self.penalty not in ['l1', 'l2']:
                self.penalty = 'l2'
            dual = self.penalty == 'l2'
            self.classifier = GS(cv=5,
                                 estimator=SVM(penalty=self.penalty,
                                               dual=dual,
                                               max_iter=self.epoch),
                                 param_grid=c_grid)
        elif self.model == 'R':
            # Random guess baseline
            self.classifier = DC(strategy='stratified')
        else:
            logging.info('Unsupported model : %s', self.model)
            exit(0)

        self.classifier.fit(train_data[0], train_data[1])
        # Predict once; the earlier duplicate call discarded its result.
        predictions = self.classifier.predict(train_data[0])
        acc = evaluator.accuracy_score(train_data[1], predictions)
        return acc
예제 #6
0
def main():
    """Fit a soft-voting ensemble on MNIST and print train and test scores.

    The ensemble combines ``LR``, ``RFC`` and ``GNB`` instances built with
    default arguments. Labels are loaded with ``one_hot=False``.
    """
    mnist = input_data.read_data_sets('MNIST_DATA', one_hot=False)

    members = [('lr', LR()), ('rf', RFC()), ('gnb', GNB())]
    eclf = VotingClassifier(estimators=members, voting='soft')

    X, y = mnist.train.images, mnist.train.labels
    print('starting')
    eclf = eclf.fit(X, y)

    train_score = eclf.score(X, y)
    test_score = eclf.score(mnist.test.images, mnist.test.labels)
    print(train_score, test_score)
 def SOmodelexp1():
     """Run ``modelExperiment`` on the Stack Overflow data with seven classifiers.

     All classifiers use default arguments except the random forest, which is
     built with 200 estimators. Instances and display names are passed as two
     parallel lists; results go to ``SOmodelExperiment1.csv`` and
     ``SOclassifier_plot1.png`` as named in the call.
     """
     classifiers = [
         LR(),
         DT(),
         KNC(),
         RF(n_estimators=200),
         ABC(),
         GNB(),
         QDA(),
     ]
     display_names = [
         'LogisticRegression',
         'DTree',
         'KNN',
         'RandomForest',
         'AdaBoosted',
         'GaussianNB',
         'QuadraticDiscriminantAnalysis',
     ]
     modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                     fullFV, classifiers, display_names,
                     'SOmodelExperiment1.csv', 'SOclassifier_plot1.png', True)
예제 #8
0
    def create_estimator(self):
        """Instantiate the estimator that matches ``self.estimator_id``.

        The id is tested with the ``mlc`` predicates in the order SVM, random
        forest, naive Bayes. The first match is constructed and configured
        with the corresponding ``*_params`` dict through ``set_params``.

        Returns:
            The configured estimator, or None when no predicate matches.
        """
        if mlc.is_SVM_id(self.estimator_id):  # SVM
            estimator, params = SVC(), self.SVM_params
        elif mlc.is_RandomForest_id(self.estimator_id):  # RF
            estimator, params = RF(), self.RF_params
        elif mlc.is_NaiveBayes_id(self.estimator_id):  # GNB
            estimator, params = GNB(), self.GNB_params
        else:
            return None

        estimator.set_params(**params)
        return estimator
예제 #9
0
    def __init__(self, **kwargs):
        r"""Initialize GaussianNB instance.

        Installs 'ignore' filters for a fixed list of warning categories,
        creates the wrapped ``GNB`` estimator and then runs the parent
        initializer. ``kwargs`` is accepted but not used in this method.
        """
        silenced_categories = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        # Same filters, same order as before, installed in a loop.
        for category in silenced_categories:
            warnings.filterwarnings(action='ignore', category=category)

        self.__gaussian_nb = GNB()
        super(GaussianNB, self).__init__()
예제 #10
0
파일: main.py 프로젝트: bbruhh/stacking
def main():
    """Compare single, voting and stacking classifiers on noised digits.

    Uniform noise scaled by 15 is added to the digits data, 80% of the samples
    are held out for testing, and macro-averaged precision, recall and F1 are
    printed for each base estimator, for hard and soft voting, and for a
    stacking classifier whose second argument is a 2000-tree ``RFC``.
    """

    def show_macro_scores(predictions):
        # Only the first three returned values are formatted; any further
        # values are ignored by the format string.
        print("p:{0:.4f} r:{1:.4f} f1:{2:.4f}".format(
            *precision_recall_fscore_support(
                y_test, predictions, average="macro")))

    digits = load_digits()
    noise = np.random.random(digits.data.shape) * 15
    noised_data = digits.data + noise

    X_train, X_test, y_train, y_test = train_test_split(noised_data,
                                                        digits.target,
                                                        test_size=0.8)

    estimators = [
        ("svm", SVC(C=5, gamma=0.001, probability=True)),
        ("lr", LogisticRegression()),
        ("knn", KNN(n_jobs=-1)),
        ("nb", GNB()),
        ("rfc", RFC(n_estimators=500, n_jobs=-1)),
        ("bgg", BaggingClassifier(n_estimators=300, n_jobs=-1)),
        ("mlp", MLPClassifier(hidden_layer_sizes=(40, 20), max_iter=1000)),
        ("xgb", XGBClassifier(n_estimators=300, n_jobs=-1)),
    ]

    # Each base estimator on its own.
    for name, clf in estimators:
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        print(name)
        show_macro_scores(preds)

    # Majority ("hard") and probability-averaged ("soft") voting.
    for mode in ["hard", "soft"]:
        voter = VotingClassifier(estimators, voting=mode)
        voter.fit(X_train, y_train)
        preds = voter.predict(X_test)
        print(mode, "voting")
        show_macro_scores(preds)

    stacker = StackingClassifier(estimators,
                                 RFC(n_estimators=2000, n_jobs=-1))
    stacker.fit(X_train, y_train)
    preds = stacker.predict(X_test)
    print("stacking")
    show_macro_scores(preds)
예제 #11
0
def learn():
    """Cross-validate ``GNB`` against a linear ``SVC`` and test the better one.

    ``preprocess()`` supplies the data chunks (index 0) and the test set
    (index 1). On every round the last chunk is rotated to the front and used
    as the dev set, while the remaining chunks are concatenated for training.
    Correct dev predictions are accumulated per model, and a running figure
    (correct count / dev size / rounds so far) is printed after each fit.

    Returns:
        ``testm(model, chunks, test)`` for the model with more correct
        predictions; the SVM wins ties.
    """
    prep = preprocess()
    chunks, test = prep[0], prep[1]
    nbmodel = GNB()
    svmodel = SVC(gamma='auto', kernel='linear')
    nbnum = 0
    svnum = 0

    for round_no in range(1, len(chunks) + 1):
        # Rotate so that a different chunk serves as the dev set each round.
        chunks.insert(0, chunks.pop(-1))
        dev = chunks[0]
        train = [[], []]
        for part in chunks[1:]:
            train[0].extend(part[0])
            train[1].extend(part[1])

        nbmodel.fit(train[0], train[1])
        data = nbmodel.predict(dev[0])
        nbnum += sum(1 for i, guess in enumerate(data) if guess == dev[1][i])
        print('Finished GNB loop {}'.format(round_no))
        print(nbnum / len(data) / round_no)
        print()

        svmodel.fit(train[0], train[1])
        data = svmodel.predict(dev[0])
        svnum += sum(1 for i, guess in enumerate(data) if guess == dev[1][i])
        print('Finished SVM loop {}'.format(round_no))
        print(svnum / len(data) / round_no)
        print()

    winner = svmodel if svnum >= nbnum else nbmodel
    return testm(winner, chunks, test)
예제 #12
0
    def bayes(self, X, y, valid, test):
        """Fit ``GNB`` on ``(X, y)`` and report metrics on the test split.

        Args:
            X: training features passed to ``fit``.
            y: training labels passed to ``fit``.
            valid: validation split; accepted for interface compatibility but
                not used by this method.
            test: test split with "Class" and "Time" columns. "Class" is the
                label; both columns are dropped before prediction.
                (presumably a pandas DataFrame — confirm against the caller)

        Returns:
            The dict produced by ``self.compute_metrics``, extended with a
            'time' entry holding the wall-clock duration of ``fit`` in
            seconds.
        """
        # Using data priors worked best
        nb_model = GNB()

        start = time.time()
        nb_model.fit(X, y)
        end = time.time()

        # Build the feature frame once; it feeds both predict_proba and predict.
        test_features = test.drop("Class", axis=1).drop("Time", axis=1)
        # Column 1 of predict_proba is used as the score for the metrics.
        y_score = nb_model.predict_proba(test_features)[:, 1]
        results = nb_model.predict(test_features)

        # Get metrics
        mets = self.compute_metrics(test["Class"], results, y_score)
        mets['time'] = end - start

        print('AUROC:', mets['auroc'])
        print('Accuracy:', mets['accuracy'])
        print('Precision:', mets['precision'])
        print('Recall:', mets['recall'])
        print('F Score:', mets['f'])
        print('Average Precision', mets['ap'])
        print(mets['confusion'], '\n')

        return mets
예제 #13
0
# Hold out a validation split; `seed` makes the split reproducible.
X_train, X_validation, Y_train, Y_validation = m_s.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Metric handed to cross_val_score for every candidate model.
scoring = 'accuracy'

# (name, estimator) pairs to compare, all with default arguments.
print('\n instantiating candidate models...')
models = [
    ('LR', LR()),
    ('LDA', LDA()),
    ('KNC', KNC()),
    ('DTC', DTC()),
    ('GNB', GNB()),
]

# run test harness
results = []
names = []
print('\n running test harness...')
# 10-fold splitter, built once because it does not depend on the model.
# NOTE(review): assumes m_s is sklearn.model_selection — confirm the import.
# random_state is intentionally not passed: without shuffle=True it has no
# effect, and scikit-learn >= 0.24 raises ValueError for that combination.
# Use KFold(n_splits=10, shuffle=True, random_state=seed) if shuffled folds
# are wanted instead.
kfold = m_s.KFold(n_splits=10)
for name, model in models:
    # Cross-validate each candidate on the training split only
    # (features X_train, targets Y_train).
    cv_results = m_s.cross_val_score(model,
                                     X_train,
                                     Y_train,
                                     cv=kfold,
                                     scoring=scoring)
    def make_feature_graph(self,
                           feature_list,
                           labels_filename="trainingSetLabels.dat"):
        '''
        Plot two diagnostic graphs for the given features.

        1. Decision boundaries (only when every sample has 1 or 2 features):
           one subplot per classifier for
           ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'SVM',
            'AdaBoost'].
        2. Scatter plot: values of one feature for the samples labelled true
           and fake, for manual inspection of how separable they look. No
           classifier is involved in this plot.

        Parameters:
            feature_list: A list of lists containing the features for each
                sample, in the same order as the lines of the label file.
            labels_filename: Path to the file containing one integer label
                per line (non-zero means true, zero means fake).

        When samples have more than one feature, the index of the feature to
        scatter-plot is read interactively from standard input.
        '''

        y = []
        x_true_list = []
        x_fake_list = []
        with open(labels_filename) as label_file:
            for idx, label in enumerate(label_file):
                if int(label):
                    y.append(1)
                    x_true_list.append(feature_list[idx])
                else:
                    y.append(0)
                    x_fake_list.append(feature_list[idx])

        y = np.array(y)
        X_plot = feature_list
        n_features = len(feature_list[0])

        #---------------------------- Decision Boundary Plot -----------------------#
        if n_features in (1, 2):
            print("Now plotting Decision boundary Plot. (Works best for 2 features)")

            classifiers = [
                LogisticRegression(random_state=1),
                RFC(n_estimators=100, random_state=1),
                GNB(),
                SVC(),
                ABC(),
            ]
            labels = [
                'Logistic Regression', 'Random Forest', 'Naive Bayes', 'SVM',
                'AdaBoost'
            ]

            # Size the grid from the number of classifiers. The former fixed
            # 2x2 grid had four cells for five classifiers, so zip() silently
            # dropped AdaBoost.
            n_cols = 2
            n_rows = (len(classifiers) + n_cols - 1) // n_cols
            gs = gridspec.GridSpec(n_rows, n_cols)

            # Keep the former height of 4 per grid row.
            fig = plt.figure(figsize=(10, 4 * n_rows))

            cells = itertools.product(range(n_rows), range(n_cols))
            for clf, lab, (row, col) in zip(classifiers, labels, cells):
                clf.fit(X_plot, y)
                # Select the subplot so the regions are drawn into it.
                plt.subplot(gs[row, col])
                plot_decision_regions(X=X_plot, y=y, clf=clf, legend=2)
                plt.title(lab)

            plt.show()

        #---------------------------- Individual Scatter Plot -----------------------#
        plot_idx = 0
        if n_features != 1:
            plot_idx = int(
                raw_input(
                    "Your list has more than 1 feature. Which feature would you like to observe? (Insert Index): "
                ))

        print("Now plotting scatter plot of feature:")
        x_true = np.array([feat[plot_idx] for feat in x_true_list])
        x_fake = np.array([feat[plot_idx] for feat in x_fake_list])
        # Shared sample-index axis, long enough for the larger group.
        y_plot = np.arange(max(len(x_true), len(x_fake)))

        trace_true = go.Scatter(y=x_true,
                                x=y_plot,
                                mode='markers',
                                text="True")
        trace_fake = go.Scatter(y=x_fake,
                                x=y_plot,
                                mode='markers',
                                text="Fake")

        data = [trace_true, trace_fake]
        layout = go.Layout(showlegend=False)
        fig = go.Figure(data=data, layout=layout)
        offline.plot(fig, filename='text-chart-basic')
    def article_classifier(self):
        """Fit five classifiers on the article features and print their scores.

        Training and dev features come from ``extractFourGram`` with the
        values from ``self.pos_load_features()`` appended as one more row;
        both are then transposed into arrays. For each classifier the method
        prints the 5-fold cross-validation scores with their mean and standard
        deviation, followed by the accuracy on the dev set. Returns None.
        """
        train_pos, dev_pos = self.pos_load_features()

        X = list(extractFourGram('featureFour.txt', 'basic.csv'))
        X_dev = list(extractFourGram('featureFour_dev.txt', 'basic_dev.csv'))
        y_dev = self.get_dev_labels()
        y = self.labels

        X.append(train_pos)
        X_dev.append(dev_pos)

        # Rows were collected per feature; transpose so that rows are samples.
        X = np.array(X).T[:, :]
        X_dev = np.array(X_dev).T[:, :]

        def fit_and_report(clf):
            # Fit on the full training set, then print CV scores and dev accuracy.
            clf.fit(X, y)
            dev_predictions = clf.predict(X_dev)
            cv_scores = cross_val_score(clf, X, y, cv=5, n_jobs=5)
            # Space-joined %s formatting reproduces the former
            # `print a, b, c` statement output.
            print("%s %s %s" % (cv_scores, np.mean(cv_scores),
                                np.std(cv_scores)))
            print(accuracy_score(y_dev, dev_predictions))

        fit_and_report(LogisticRegression())

        # SVM tuning grid noted for reference:
        # {'C': [0.1,1.0,10.0,100.0], 'gamma':[1.0,2.0,'auto',0.1,0.01,0.001], 'kernel':['rbf','linear']}
        fit_and_report(SVC(probability=True))

        # RandomForest tuning grid noted for reference:
        # {'n_estimators':[10,20,5,30],'criterion':['gini','entropy']}
        fit_and_report(RFC())

        # AdaBoost tuning grid noted for reference:
        # {'n_estimators':[10,20,5,30],'learning_rate':[1.0,0.1,0.01,0.001,0.05]}
        fit_and_report(ABC())

        # Gaussian NB: no tuning grid is recorded (the earlier note here was a
        # copy of the AdaBoost grid).
        fit_and_report(GNB())
예제 #16
0
            df['OCS3'] = replace_non_numeric(df['OCS3'], 'absent')
            df['OCS4'] = replace_non_numeric(df['OCS4'], 'absent')
            position_of_split = 10

        y_data = df.iloc[:, -1]
        x_data = df.iloc[:, 0:-1]

        print 'shape of data is ', x_data.shape, 'while shape of target is ', y_data.shape

        #%% Select base learners

        learnersX = [
            KNN(n_neighbors=5),
            EXTRA(n_estimators=30, random_state=rs_l),
            RF(n_estimators=30, random_state=rs_l),
            GNB(),
            GraB(random_state=rs_l)
        ]
        learnersY = [
            KNN(n_neighbors=5),
            EXTRA(n_estimators=30, random_state=rs_l),
            RF(n_estimators=30, random_state=rs_l),
            GNB(),
            GraB(random_state=rs_l)
        ]

        for ww in range(0, len(rs1_list)):

            flag = False  # a parameter for writing helpful headings to xls files
            w_count = 1
            sheet1 = book.add_sheet('sheet' + str(ww) + '_' + ff)
예제 #17
0
# Precision-recall curve and average precision for the scores held in
# `pred_probas`, which is assigned earlier in the script.
# NOTE(review): the *_svm names suggest these scores come from an SVM — confirm.
# (The PR-curve plotting that used to follow is disabled.)
precision_svm, recall_svm, _ = precision_recall_curve(y_test, pred_probas)
auc_svm = average_precision_score(y_test, pred_probas)

# Naive Bayes: classification only. Even faster than linear models and suited
# to very large, high-dimensional data sets, but usually less accurate than
# linear models.
gnb = GNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
print('Training Score: ', gnb.score(x_train, y_train))
print('Testing Score: ', gnb.score(x_test, y_test))
print(len(y_test), len(y_pred))
# Accuracy:  share of all samples that are classified correctly.
# Precision: share of the samples predicted positive that are truly positive.
# Recall:    share of the truly positive samples that are predicted positive.
for _metric_label, _metric_fn in (('准确率: ', accuracy_score),
                                  ('精确率: ', precision_score),
                                  ('召回率: ', recall_score)):
    print(_metric_label, _metric_fn(y_test, y_pred))
print("F1值: %.3f" % f1_score(y_test, y_pred))

# Column 1 of predict_proba is used as the score for the PR curve.
pred_probas = gnb.predict_proba(x_test)[:, 1]
precision_bays, recall_bays, _ = precision_recall_curve(y_test, pred_probas)
auc_bays = average_precision_score(y_test, pred_probas)
# Logistic regression
예제 #18
0

if __name__ == "__main__":
    # Demo: compare this module's GaussianNB with scikit-learn's GaussianNB
    # (imported as GNB) on the breast-cancer data set.
    from sklearn import metrics
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB as GNB

    dataset = load_breast_cancer()
    # 80% of the samples train the models, 20% are held out for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=20)

    reference_model = GNB()
    model = GaussianNB(num_classes=2)

    reference_model.fit(X_train, y_train)
    model.fit(X_train, y_train)

    y_pred0 = reference_model.predict(X_test)
    # The local model's predict returns a pair; only the first item is scored.
    y_pred, _ = model.predict(X_test)

    accu0 = metrics.accuracy_score(y_pred0, y_test)
    accu = metrics.accuracy_score(y_pred, y_test)
    print(f"accu0 is {accu0} and accu is {accu}")
예제 #19
0
def get_model(name):
    """Return a freshly constructed classifier for the short key ``name``.

    Supported keys: "mock", "lda", "qda", "gnb", "knn", "forest", "logistic"
    and "svm". Only the requested classifier is constructed.

    Raises:
        KeyError: if ``name`` is not one of the supported keys.
    """
    # Zero-argument factories: the lambdas defer construction, so the seven
    # classifiers that were not asked for are never instantiated.
    factories = {
        "mock": lambda: ClassifierMock(),
        "lda": lambda: LDA(),
        "qda": lambda: QDA(),
        "gnb": lambda: GNB(),
        "knn": lambda: KNN(),
        "forest": lambda: RandomForestClassifier(),
        "logistic": lambda: LogisticRegression(class_weight="balanced"),
        "svm": lambda: SVC(kernel="linear", class_weight="balanced"),
    }
    return factories[name]()
예제 #20
0
# Append the one-hot encoded Sex, Pclass and Embarked columns.
data_all = pd.concat([data_all, Sex_dummies, Pclass_dummies, Embarked_dummies],
                     axis=1)

# Columns used as model inputs.
feature = [
    'Age', 'Fare', 'FamilySize', 'Cabin_null', 'Cabin_nnull', 'Sex_female',
    'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
    'Embarked_S'
]

# Training rows are the rows of data_all whose index appears in `data`.
X = data_all.loc[data.index][feature]
y = data.Survived

# Candidate classifiers keyed by a short display name.
modelDict = {
    'DT': DT(),
    'SVC': SVC(),
    'GNB': GNB(),
    'KNN': KNN(n_neighbors=3),
    'MLP': MLP(hidden_layer_sizes=(500, )),
    # NOTE(review): assumes LogR is sklearn.linear_model.LogisticRegression —
    # confirm against the imports. The L1 penalty needs a solver that supports
    # it; 'liblinear' was the default in older scikit-learn releases, while the
    # current default ('lbfgs') rejects penalty='l1'.
    'LogR': LogR(C=1.0, penalty='l1', solver='liblinear', tol=1e-6),
    'RF': RF(n_estimators=300),
    'GB': GB(n_estimators=500)
}

# Print the mean 5-fold cross-validation score of every candidate in percent.
for model, clf in modelDict.items():
    scores = cross_val_score(clf, X, y, cv=5)
    print(model + ' accuracy: ' + '%.3f' % (scores.mean() * 100) + '%')

votingC = VotingClassifier(estimators=[('clf_GB', GB(n_estimators=500)),
                                       ('clf_RF', RF(n_estimators=300)),
                                       ('clf_SVC', SVC(probability=True)),
예제 #21
0
        "bumpiness": bumpy_bkg
    }
}

############################################################################################################
from sklearn.naive_bayes import GaussianNB as GNB

# Bounds of the square over which the decision surface is drawn.
x_min, x_max = 0.0, 1.0
y_min, y_max = 0.0, 1.0

clf = GNB()
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

# Plot the decision boundary: predict a class for every point of a mesh that
# covers [x_min, x_max] x [y_min, y_max].
h = .01  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Flatten the mesh into (x, y) rows for predict, then fold the predictions
# back into the mesh shape for the colour plot.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)
예제 #22
0
def NaiveBayes(x_train, x_test, y_train, y_test):
    """Fit ``GNB`` on the training split and print its score on the test split.

    ``y_train`` is flattened with ``ravel()`` before fitting. The module-level
    ``c_index`` is read only to label the printed line. Returns None.
    """
    fitted = GNB().fit(x_train, y_train.ravel())
    score = fitted.score(x_test, y_test)
    print("c_index = ", c_index, " 精度为", score)
예제 #23
0
def Bokeh_Decision_Boundaries_2D(X, Variable1, Variable2, y, Estimators = [LDA(),QDA(),KNN(),GNB(),TREE(), Random_Forest_Classifier(Rand_Param_Search = False), SVC(probability = True)], Notebook_Url = "None", Test_Size = 0.3, Random_State = None, Scale = True , Palette = "RdBu", Delta = 0.02, Output = 1):
    


    X=X
    Variable1 = Variable1
    Variable2 = Variable2
    y=y
    Estimators =Estimators 
    Notebook_Url = Notebook_Url 
    Test_Size =Test_Size
    Random_State = Random_State
    Scale = Scale 
    Palette = Palette 
    Delta = Delta
    Output = Output
    Estimator_Names = [str(estimator).split("(")[0] for estimator in Estimators]
           
    output_notebook()

    if Notebook_Url =="None":
        raise ValueError("Must specify the Notebook_Url i.e. localhost:port_number where port_number is the port on which the notebook is running")

    # def modify_doc(doc):
       
    # The callback for update
    def update_plots(Variable1, Variable2, y, Active_Estimators):

        print("Starting update")
        nonlocal Estimators
        if not isinstance(Estimators, (type(np.array), list)):
            Estimators = np.array([Estimators])
        
        estimator_names = np.array(list(Active_Estimators))
        ix = np.isin(Estimator_Names, estimator_names)
        estimator_indices =  [int(i) for i in np.where(ix)[0].flatten()]

        estimators = np.array(Estimators)[estimator_indices]
 
        variable1 = Variable1
        variable2 = Variable2
        y= y
    
        plots = [None for i in range(len(estimators))]
        image_sources = [None for i in range(len(estimators))]
        observation_sources = [None for i in range(len(estimators))]
        hover_tools = [None for i in range(len(estimators))]
        model_score_sources= [None for i in range(len(estimators))]
        glyphs0= [None for i in range(len(estimators))]
        color_bars= [None for i in range(len(estimators))]
        p_circles = [None for i in range(len(estimators))]
        p_images = [None for i in range(len(estimators))]
        

        
        #Iterate over the estimators
        for idx, estimator in enumerate(estimators):
            #Find the title for each plot
            estimator_name = str(estimator).split('(')[0]
            
            #Extract the needed data
            full_mat = X[[variable1, variable2, y]].dropna(how = "any", axis = 0)

            #Define a class bijection for class colour mapping
            unique_classes, y_bijection = np.unique(full_mat[y], return_inverse = True)
            full_mat['y_bijection'] = y_bijection
            
            #Rescale the X Data so that the data fits nicely on the axis/predictions are reliable
            full_mat[variable1 + "_s"] = StandardScaler().fit_transform(full_mat[variable1].values.reshape((-1,1)))
            full_mat[variable2 + "_s"] = StandardScaler().fit_transform(full_mat[variable2].values.reshape((-1,1)))

            #Define the Step size in the mesh
            delta = Delta 

            #Separate the data into arrays so it is easy to work with
            X1 = full_mat[variable1 + "_s"].values
            X2 = full_mat[variable2 + "_s"].values
            Y = full_mat["y_bijection"].values 

            #Define the mesh-grid co-ordiantes over which to colour in
            x1_min, x1_max = X1.min() -0.5, X1.max() +0.5
            x2_min, x2_max = X2.min() -0.5, X2.max() +0.5

            #Create the meshgrid itself
            x1, x2 = np.arange(x1_min, x1_max, delta), np.arange(x2_min, x2_max, delta)
            x1x1, x2x2 = np.meshgrid(x1, x2)

            #Create the train test split
            X_train, X_test, y_train, y_test = train_test_split(full_mat[[variable1+"_s",variable2+"_s"]], Y, test_size = Test_Size, random_state = Random_State)
            #Fit and predict/score the model
            model = estimator.fit(X= X_train, y= y_train)
            # train_preds = model.predict(X_train)
            # test_preds = model.predict(X_test)
            model_score = model.score(X_test, y_test)
            model_score_text = "Model score: %.2f" % model_score

            if hasattr(model, "decision_function"):
                Z = model.decision_function(np.c_[x1x1.ravel(), x2x2.ravel()])

            elif hasattr(model, "predict_proba"):
                Z = model.predict_proba(np.c_[x1x1.ravel(), x2x2.ravel()])
            
            else:
                print("This Estimator doesn't have a decision_function attribute and can't predict probabilities")

         
            Z = np.argmax(Z, axis = 1)  
            Z_uniques = np.unique(Z)

            unique_predictions = unique_classes[Z_uniques]

            Z = Z.reshape(x1x1.shape)

            #Add in the probabilities and predicitions for the tooltips
            full_mat["probability"] = np.amax(model.predict_proba(full_mat[[variable1 + "_s", variable2 + "_s"]]), axis = 1)
            
            bijected_predictions= model.predict(full_mat[[variable1 + "_s", variable2 + "_s"]])
            full_mat["prediction"] = unique_classes[bijected_predictions]

            #Add an associated color to the predictions   
            number_of_colors= len(np.unique(y_bijection))
            
            #Create the hover tool to be updated
            hover = HoverTool(tooltips = [
                (variable1,"@"+variable1),
                 (variable2, "@"+variable2),
                  ("Probability", "@probability"),
                   ("Prediction", "@prediction"),
                   ("Actual", "@"+y)])
            
            #Create the axes for all the plots
            plots[idx] = figure(x_axis_label = variable1, y_axis_label = variable2, title = estimator_name, x_range = (x1x1.min(),x1x1.max()),y_range =  (x2x2.min(),x2x2.max()), plot_height = 600, plot_width = 600)

            #Create all the image sources
            image_data = dict()
            image_data['x'] = np.array([x1x1.min()])
            image_data["y"] = np.array([x2x2.min()])
            image_data['dw'] = np.array([x1x1.max()-x1x1.min()])
            image_data['dh'] = np.array([x2x2.max() - x2x2.min()])
            image_data['boundaries'] = [Z]
            

            image_sources[idx] = ColumnDataSource(image_data)

            #Create all the updatable images (boundaries)
            p_images[idx] = plots[idx].image(image = 'boundaries', x= 'x', y = 'y', dw = 'dw', dh= 'dh', palette = "RdBu11", source = image_sources[idx])
            
            #Create the sources to update the observation points 
            observation_sources[idx] = ColumnDataSource(data = full_mat)

            #Create all the updatable points
            low = full_mat["y_bijection"].min()
            high = full_mat["y_bijection"].max()
            cbar_mapper = LinearColorMapper(palette = RdBu[number_of_colors], high = high, low = low)    
            
            p_circles[idx] = plots[idx].circle(x =variable1 +"_s", y= variable2 + "_s", color = dict(field = 'y_bijection', transform = cbar_mapper),  source = observation_sources[idx], line_color = "black")
           

            #Create the hovertool for each plot
            hover_tools[idx] = hover

            #Add the hover tools to each plot
            plots[idx].add_tools(hover_tools[idx])

            #Create all the text sources (model scores) for the plots
            model_score_sources[idx] = ColumnDataSource(data = dict(x=[x1x1.min()+0.3], y=[x2x2.min()+0.3], text=[model_score_text]))

            #Add the model scores to all the plots
            score_as_text = Text(x = "x", y = "y", text = "text")
            glyphs0[idx] = plots[idx].add_glyph(model_score_sources[idx], score_as_text)

            #Add a colorbar
            color_bars[idx] = ColorBar(color_mapper= cbar_mapper , ticker=BasicTicker(desired_num_ticks = number_of_colors), label_standoff=12, location=(0,0), bar_line_color = "black")

            plots[idx].add_layout(color_bars[idx],"right")
            plots[idx].add_tools(LassoSelectTool(), WheelZoomTool())
            
            # configure so that no drag tools are active
            plots[idx].toolbar.tools = plots[idx].toolbar.tools[1:]       
            plots[idx].toolbar.tools[0], plots[idx].toolbar.tools[-2] = plots[idx].toolbar.tools[-2], plots[idx].toolbar.tools[0]
       
        # nonlocal layout
        layout =gridplot([],[row(plot) for plot in plots])
        handle0 = show(layout,notebook_url = Notebook_Url, notebook_handle = True) 
     
        #Finished the callback
        push_notebook(handle = handle0)

    

    estimators_used_widget = widgets.SelectMultiple(options = Estimator_Names, value = Estimator_Names, description = "Estimators", disabled = False)

    interact_manual(update_plots, Variable1 = [Variable1]+ list(X.columns.values), Variable2 = [Variable2] + list(X.columns.values), y =[y] +list(X.columns.values) , Active_Estimators = estimators_used_widget)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as t_t_s
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture as GM
from sklearn.metrics import accuracy_score as a_s

# Load the iris table; the 'species' column is the label, the rest are features.
df = sns.load_dataset('iris')
x = df.drop('species', axis=1)
y = df['species']

# Baseline: Gaussian naive Bayes trained on all original feature columns.
xtr, xte, ytr, yte = t_t_s(x, y, test_size=0.25, random_state=0)
print(xtr.shape, yte.shape)  # (112, 4) (38,)
model = GNB()
model.fit(xtr, ytr)
ypred = model.predict(xte)
print("分类准确率:{0:.2%}".format(a_s(yte, ypred)))

# dimensionality reduction
# The projection is learned from the training rows only and then applied to
# the held-out rows, so no information from the test split leaks into PCA.
# The split above is reused, which keeps both models on the same partition.
pca = PCA(n_components=2)
xtr_new = pca.fit_transform(xtr)
xte_new = pca.transform(xte)
ytr_new, yte_new = ytr, yte
# Two-component projection of the complete data set, kept under its original
# name for any later code that refers to it.
new_x = pca.transform(x)
print(xtr_new.shape, yte_new.shape)  # (112, 2) (38,)
model1 = GNB()
model1.fit(xtr_new, ytr_new)
ypred1 = model1.predict(xte_new)
예제 #25
0
    cross_val_score(model_lr, train_data, label, cv=10, scoring='roc_auc'))

# Predict the test set with the fitted logistic-regression model and write the
# submission file. quoting=3 is the value of csv.QUOTE_NONE.
result = model_lr.predict(test_data)
output = pd.DataFrame(data={
    "PassengerId": test["PassengerId"],
    "Survived": result
})
output.to_csv("lr.csv", index=False, quoting=3)

# #### Accuracy after submitting to Kaggle: 0.78469

# ### Gaussian naive Bayes

# In[20]:

# Fit Gaussian naive Bayes on the same training data and print the mean of the
# 10-fold cross-validated ROC AUC scores.
model_GNB = GNB()
model_GNB.fit(train_data, label)
print "高斯贝叶斯分类器10折交叉验证得分: ", np.mean(
    cross_val_score(model_GNB, train_data, label, cv=10, scoring='roc_auc'))

# Same submission layout as above, written to a separate file.
result = model_GNB.predict(test_data)
output = pd.DataFrame(data={
    "PassengerId": test["PassengerId"],
    "Survived": result
})
output.to_csv("gnb.csv", index=False, quoting=3)

# #### Accuracy after submitting to Kaggle: 0.74163

# ### Random forest
예제 #26
0
def new_gnb():
    """Return a new GNB estimator built from a (currently empty) keyword set."""
    hyperparameters = dict()
    estimator = GNB(**hyperparameters)
    return estimator
예제 #27
0
def roc(df, target, model_to_fit, ax):
    """Draw one ROC curve per predictor column of *df* on *ax*.

    For every column other than *target*, the column alone is used as the
    single feature: the data is split with ``train_test_split`` (no
    ``random_state`` is passed, so the split is not fixed between calls),
    *model_to_fit* is fitted on the training part and its ROC curve on the
    held-out part is added to *ax*, labelled with the column name.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        target: Name of the column in *df* that holds the class labels.
        model_to_fit: Estimator instance; the same object is refitted for
            each column.
        ax: Matplotlib axes that receives all curves.
    """
    from sklearn.model_selection import train_test_split as tts
    try:
        # scikit-learn >= 1.0; plot_roc_curve was removed in 1.2.
        from sklearn.metrics import RocCurveDisplay
        draw_roc = RocCurveDisplay.from_estimator
    except ImportError:
        # Older scikit-learn releases only ship the function form.
        from sklearn.metrics import plot_roc_curve as draw_roc

    y = df[target].to_numpy()
    model = model_to_fit

    for col in df.drop(target, axis=1).columns:
        # Estimators expect a 2-D feature matrix, hence the single-column reshape.
        X = df[col].to_numpy().reshape(-1, 1)

        xtrain, xtest, ytrain, ytest = tts(X, y)

        model.fit(xtrain, ytrain)

        draw_roc(model, xtest, ytest, ax=ax, label=col)


# Plot the per-feature ROC curves of three classifiers (logistic regression,
# k-nearest neighbours, Gaussian naive Bayes), one stacked subplot per model.
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB as GNB
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(18, 18), sharex=True, sharey=True)
for panel, classifier in zip(ax, (LR(), KNN(), GNB())):
    roc(data, 'Class', classifier, panel)
예제 #28
0
from sklearn.naive_bayes import GaussianNB as GNB
import sklearn.model_selection as skl
from sklearn import preprocessing as ppr
import read_preprocess_data as rpd
from sklearn import metrics as met

# A first, exploratory attempt at Gaussian naive Bayes.
musicdata = pd.read_csv('sample-data.csv')
le = ppr.LabelEncoder()
# Intended step: label-encode the independent and dependent columns because
# they are categorical. Original note: the encoding part is not functioning
# right, so the fit_transform calls below are commented out. `le`, `x` and `y`
# are not used again in the visible part of this script.
x = musicdata.iloc[:, :-1].values
y = musicdata.iloc[:, -1].values
# x = le.fit_transform(x)
# y = le.fit_transform(y)
# Earlier approach, disabled: split x / y directly into train and test sets.
# x_train, x_test, y_train, y_test = skl.train_test_split(x, y)
# print(x_test)
# print(y_test)

# Obtain the train/test split from the shared project module
# read_preprocess_data instead; it returns two (features, labels) pairs.
(x_train, y_train), (x_test, y_test) = rpd.getSplitData()
# Create the Gaussian naive Bayes classifier.
model = GNB()

# Fit on the training set.
model.fit(x_train, y_train)
# Predict the test set.
predicted = model.predict(x_test)
print("Model accuracy is : ", met.accuracy_score(y_test, predicted))
예제 #29
0
def _encode_events(events, event_set):
    """Translate event labels into their integer position in *event_set*.

    Labels that cannot be found in *event_set* are encoded as -1, so the
    result always holds exactly one code per input label and stays aligned
    with the other feature columns.
    """
    position_of = {}
    for position, label in enumerate(event_set):
        # Keep the first position if a label is listed more than once.
        position_of.setdefault(label, position)
    return [position_of.get(label, -1) for label in events]


def _parse_dates(dates):
    """Return (timestamps, weekdays) for an iterable of '%Y-%m-%d' date values."""
    timestamps = []
    weekdays = []
    for raw in dates:
        parsed = datetime.strptime(str(raw), '%Y-%m-%d')
        timestamps.append(parsed.timestamp())
        weekdays.append(parsed.weekday())
    return timestamps, weekdays


def _build_feature_rows(data, timestamps, weekdays, event_codes):
    """Assemble one feature list per record, in the column order used for fitting."""
    columns = zip(timestamps, weekdays, data.calendar_code, data.site_count,
                  data.max_temp, data.min_temp, data.precipitation, event_codes)
    return [list(values) for values in columns]


def main():
    """Choose the best of four models by cross-validation and predict the test set.

    Reads 'Training_set.csv' and 'Test_set.csv', prints the weekdays with the
    highest and lowest mean request count, compares the candidate models with
    10-fold cross-validation scored by negative mean squared error, writes the
    chosen model's test predictions to 'predicted_request_count.csv' (one
    value per line) and shows a scatter plot of the results.
    """
    # ----------------------------
    # Training data
    # ----------------------------
    trainingDataFile = 'Training_set.csv'
    trainingData = pd.read_csv(trainingDataFile)

    # Unique event labels in first-seen order; the same list is reused to
    # encode the test data so both sets share one coding.
    myEventSet = list(dict.fromkeys(trainingData.events))
    print('Unique events are as follows: \n', myEventSet,'\n')

    newEvents = _encode_events(trainingData.events, myEventSet)

    # Dates become a numeric timestamp plus the day of the week.
    numDateTrainData, day = _parse_dates(trainingData.date)

    # Mean request count per weekday.
    dictReqCount = {}
    for weekday, count in zip(day, trainingData.request_count):
        dictReqCount.setdefault(weekday, []).append(count)
    dictAvgReqCount = {key: sum(val) / len(val)
                       for key, val in dictReqCount.items()}

    # max()/min() return the first key reaching the extreme value.
    maxKey = max(dictAvgReqCount, key=dictAvgReqCount.get)
    print('Day #{} of the week has the max mean request count'.format(maxKey))

    minKey = min(dictAvgReqCount, key=dictAvgReqCount.get)
    print('Day #{} of the week has the min mean request count'.format(minKey))

    X = _build_feature_rows(trainingData, numDateTrainData, day, newEvents)
    Y = trainingData.request_count

    # Candidate models, compared on negative mean squared error.
    scoring = 'neg_mean_squared_error'
    models = [DTR(),GNB(),RFR(),KNR()]
    seed = 7
    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn versions reject a random_state when shuffle is False.
    kfold = MS.KFold(n_splits=10, shuffle=True, random_state=seed)
    mErr = [MS.cross_val_score(model, X, Y, cv=kfold, scoring=scoring).mean()
            for model in models]

    # Scores are negated errors, so the smallest magnitude is the best model;
    # on ties the earliest model in the list wins.
    best_model_index = min(range(len(mErr)),
                           key=lambda idx: math.fabs(mErr[idx]))
    print('\nModel #%d (i.e. %s) performed best' %(best_model_index, str(models[best_model_index]).split('(')[0]))

    # -------------------------------------------------------
    # Test Data
    # -------------------------------------------------------
    testDataFile = 'Test_set.csv'
    testData = pd.read_csv(testDataFile)

    # Encode with the training event list; labels it lacks become -1.
    newEvents = _encode_events(testData.events, myEventSet)
    numDateTestData, day = _parse_dates(testData.date)
    X_test = _build_feature_rows(testData, numDateTestData, day, newEvents)

    # Fit once and reuse the fitted model for both prediction sets, so the
    # plotted train and test predictions come from the same fit.
    bestModel = models[best_model_index]
    bestModel.fit(X, Y)
    Y_pred = bestModel.predict(X_test)
    Y_pred_train = bestModel.predict(X)
    print('\nThe predicted values for request count using the test data is as follows:\n',Y_pred)

    # The context manager closes the file even if a write fails.
    with open('predicted_request_count.csv','w') as output_file:
        output_file.writelines(str(value)+'\n' for value in Y_pred)

    # Plot the results
    plt.figure(1)
    plt.scatter(numDateTrainData, Y, c="darkorange", label="Training data")
    plt.scatter(numDateTestData, Y_pred, c="cornflowerblue", label="Test data model prediction")
    plt.scatter(numDateTrainData, Y_pred_train, c="red", label="Training data model prediction")
    plt.xlabel("Numerical Date")
    plt.ylabel("Page Count")
    plt.title("Best Model")
    plt.legend()
    plt.show()
          " " * 4,
          _Grid_Result.best_params_,
          "\n",
          sep="",
          end="\n")
    print()
    return _Grid_Result


################################################################################
# Review of the classification algorithms, each built with default arguments
_ORDINARY_MODELS = dict(
    LR=LR(),
    LDA=LDA(),
    KNC=KNC(),
    GNB=GNB(),
    DTC=DTC(),
    SVC=SVC(),
)
_ORDINARY_ALGORITHM_CMP_RESULTS = _Models_Cmp(_Models=_ORDINARY_MODELS, _Figure_Title="ALGORITHM COMPARISON")
# Original note on the comparison above: best was KNC, so that model is run.
_Model_Run(_Model=_ORDINARY_MODELS["KNC"], _Report_Title="KNC-K近邻算法")
################################################################################
# Re-review the algorithms after standardizing the data
_SCALED_MODELS = {
    "LR":
    pipeline.Pipeline([("Scaler", preprocessing.StandardScaler()),
                       ("LR", LR())]),
    "LDA":
    pipeline.Pipeline([("Scaler", preprocessing.StandardScaler()),