def subsetter(data, column, stopwords):
    """Train and evaluate a Bernoulli Naive Bayes model on review text.

    Builds a frequency distribution over the alphabetic tokens found in
    ``data[column]`` (excluding ``stopwords``) and keeps the 50 most
    common tokens, then fits ``BernoulliNB`` on a binary bag-of-words
    encoding of ``data['Reviews']`` and prints accuracy, precision and
    recall — each sklearn metric followed by the same quantity computed
    by hand from the raw predictions as a sanity check.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``column`` plus 'Reviews' and 'naive_bayes' columns.
    column : str
        Name of the text column used for the frequency distribution.
    stopwords : iterable of str
        Tokens excluded from the frequency distribution.

    Returns
    -------
    list
        The 50 most frequent non-stopword tokens (previously computed
        but discarded; returning it is backward-compatible).
    """
    tokenizer = RegexpTokenizer(r'[A-Za-z]+')

    # Flatten every text into one list of its elements (characters when
    # the column holds plain strings); the tokenizer then extracts the
    # alphabetic runs from that list's repr, as the original code did.
    flat = []
    for text in data[column]:
        flat.extend(text)
    frequency_dist = nltk.FreqDist(
        word for word in tokenizer.tokenize(str(flat))
        if word not in stopwords)
    top50 = sorted(frequency_dist, key=frequency_dist.__getitem__,
                   reverse=True)[:50]

    # 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        data['Reviews'].values, data['naive_bayes'].values,
        test_size=0.2, random_state=1)

    # Binary bag-of-words features: BernoulliNB models binary indicators.
    cv = CountVectorizer(lowercase=True, stop_words='english', binary=True)
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)

    naive_bayes = BernoulliNB()
    naive_bayes.fit(X_train_cv, y_train)
    predictions = naive_bayes.predict(X_test_cv)

    # FIX: the originals printed the literal "/n" instead of a newline,
    # and the manual recall denominator carried a stray `== 1`
    # comparison inside len().
    print('Accuracy score: ', accuracy_score(y_test, predictions))
    print(sum(y_test == predictions) / len(predictions), "\n")
    print('Precision score: ', precision_score(y_test, predictions))
    print(sum(y_test[predictions == 1] == 1)
          / len(y_test[predictions == 1]), "\n")
    print('Recall score: ', recall_score(y_test, predictions))
    print(sum(predictions[y_test == 1] == 1)
          / len(predictions[y_test == 1]), "\n")

    return top50
# Report ROC AUC for the previous model's predictions, then fit a
# Gaussian Naive Bayes baseline on the same split.  Notebook-export
# fragment: `yten_test`, `pred_label`, `Xall_train`, `yall_train` and
# `Xall_test` come from earlier cells.
# NOTE(review): `pred_label` is consumed here but only (re)assigned in a
# later cell below -- correctness depends on notebook execution order.
roc_auc_score1 = roc_auc_score(yten_test,pred_label)
print("ROC AUC: {0}".format(roc_auc_score1))

# ### Naive Bayes

# In[758]:

from sklearn import naive_bayes as naive_b

# In[759]:

# Fitting the model.
# NOTE(review): this rebinds `naive_b` from the module imported above to
# a GaussianNB instance, shadowing the import.
naive_b = GaussianNB()
naive_b.fit(Xall_train,yall_train)

# In[760]:

# Predicting labels for the held-out set.
pred_label = naive_b.predict(Xall_test)

# In[761]:

from sklearn.metrics import confusion_matrix

# In[762]:
    # Tail of the decision-tree hyper-parameter grid; the dict (and
    # `dt_estimator`) are defined on earlier, unseen lines.
    'min_samples_split': [2, 5, 10]
}

# 10-fold accuracy-scored grid search over the decision tree; `refit`
# re-trains the best configuration on the full training set.
dt_grid_estimator = model_selection.GridSearchCV(dt_estimator, dt_grid,
    scoring='accuracy', cv=10, refit=True, return_train_score=True)
dt_grid_estimator.fit(training_data, y_train)
result = dt_grid_estimator.cv_results_

#######
# Multinomial Naive Bayes text classifier on the same features.
#multiNBClassifier = MultinomialNB().fit(X_train_tfidf, y_train)
naive_bayes = MultinomialNB()
naive_bayes_text_classifier = naive_bayes.fit((training_data), y_train)
# Train vs. test accuracy (recorded values from a previous run kept as
# reference).
print(naive_bayes_text_classifier.score(training_data, y_train))
#0.7658017298735862
print(naive_bayes_text_classifier.score(testing_data, y_test))
#0.7325349301397206

## Generate predictions for the test set.
predictions = naive_bayes_text_classifier.predict(testing_data)
# NOTE(review): bare expression below only displays in a notebook cell;
# it is a no-op in a script.
predictions

######### Ensemble Random Forest ###############
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
rf_estimator_clf = rf_estimator.fit(training_data, y_train)

# Dump the grid-search results collected above.
print(result)
print(result.get('params'))
print(result.get('mean_train_score'))
print(result.get('mean_test_score'))
encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=twokenize.tokenizeRawTweetText, vocabulary=None) trainx = cv.fit_transform(trainx) testx = cv.transform(testx) naive_bayes = MultinomialNB().map(lambda x: twokenize.tokenizeRawTweetText(x)) corpus = df + dft naive_bayes.fit(trainx, trainy) predictions = naive_bayes.predict(testx) #%% print("Accuracy score: ", accuracy_score(testy, predictions)) print("Precision score: ", precision_score(testy, predictions)) print("Recall score: ", recall_score(testy, predictions)) #%% svm = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto') svm.fit(trainx, trainy) predictions_svm = svm.predict(testx) print("svm Accuracy Score -> ", accuracy_score(predictions_svm, testy) * 100)
# compare with naive bayes as a baseline model nb = nb.BernoulliNB() models = [lr, nb] scores = crossValidation(X_train, y_train, models) # plot error for model LR plotErrorCurve(X_train, y_train, lr) # signifance test ttest = ttest_across_folds(scores[0], scores[1]) print("T-test score: {}".format(ttest)) # McNemar's test lr.fit(X_train, y_train) lrYPred = lr.predict(X_test) nb.fit(X_train, y_train) nbYPred = nb.predict(X_test) lr_yn = y_test == lrYPred nb_yn = y_test == nbYPred print("Logistic Regression accuracy: {}".format( np.sum(lr_yn) / len(y_test))) print("Naive Bayes accuracy: {}".format(np.sum(nb_yn) / len(y_test))) cmp = mcnemar(lr_yn, nb_yn) print("McNemar's test score: {}".format(cmp)) # plot the lr confusion matrix pl.figure() pl.matshow(met.confusion_matrix(y_test, lrYPred), interpolation='nearest') pl.colorbar() pl.ylabel('true label')