def bernoulli_naive_bayes(x_train, y_train, x_cv, y_cv):
    """Fit a Bernoulli naive Bayes classifier and report its accuracy.

    Prints the mean accuracy on the training set and on the
    cross-validation set.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_cv: Cross-validation feature matrix.
        y_cv: Cross-validation labels.

    Returns:
        The fitted ``BernoulliNB`` instance.
    """
    # A parenthesised single expression prints the same text under Python 2
    # and Python 3; the bare print statement only parses on Python 2.
    print('Training with NB...')
    clf = BernoulliNB()
    clf.fit(x_train, y_train)
    print('Accuracy in training set: %f' % clf.score(x_train, y_train))
    print('Accuracy in cv set: %f' % clf.score(x_cv, y_cv))
    return clf
def classify(opts, data_train, data_test, labels_train, labels_test):
    """Train the classifier selected by ``opts.classifier`` and evaluate it.

    Args:
        opts: Options object; ``opts.classifier`` is one of ``'nb'``,
            ``'lr'``, ``'log'`` or ``'svm'`` and ``opts.verbose`` enables
            the progress prints.
        data_train: Training features.
        data_test: Test features.
        labels_train: Training labels.
        labels_test: Test labels.

    Returns:
        ``[accuracy, test_accuracy]``: the estimator's ``score`` on the
        training data and on the test data.

    Raises:
        Exception: If ``opts.classifier`` is not a recognised key.
    """
    # ##### TRAIN THE MODEL ######################################
    # Initialize the corresponding type of the classifier and train it.
    if opts.classifier == 'nb':
        classifier = BernoulliNB(binarize=None)
    elif opts.classifier == 'lr':
        classifier = LinearRegression()
    elif opts.classifier == 'log':
        classifier = LogisticRegression()
    elif opts.classifier == 'svm':
        classifier = LinearSVC()
    else:
        raise Exception('Unrecognized classifier!')
    classifier.fit(data_train, labels_train)

    # ###### VALIDATE THE MODEL ##################################
    # The prints below use %-formatting inside a single parenthesised
    # expression so they emit the same text on Python 2 and Python 3
    # (the original Python 2 statements do not parse on Python 3).
    accuracy = classifier.score(data_train, labels_train)
    if opts.verbose:
        print("accuracy =  %s" % (accuracy,))

    # 10-fold cross validation with scoring='accuracy'; skipped for the
    # linear-regression option.
    if opts.classifier != 'lr':
        cross_val_scores = cross_validation.cross_val_score(
            classifier, data_train, labels_train, scoring='accuracy', cv=10)
        if opts.verbose:
            print("cross val mean =  %s" % (cross_val_scores.mean(),))
            print("cross val stdev =  %s" % (cross_val_scores.std(),))

    test_accuracy = classifier.score(data_test, labels_test)
    if opts.verbose:
        print("test accuracy =  %s" % (test_accuracy,))

    # Predict labels for the test set; rounded to hundredths for readability.
    labels_predicted = classifier.predict(data_test)
    labels_predicted = np.round(labels_predicted, 2)
    if opts.verbose:
        print("actual labels:\n%s" % (labels_test,))
        print("predicted labels:\n%s" % (labels_predicted,))
    return [accuracy, test_accuracy]
def compareClassifiers():
    """Compare four classifiers on the data from ``createObservations``.

    Each classifier is fitted on all observations, scored on that same
    data, and then evaluated with 10-fold cross validation
    (``scoring='accuracy'``).

    Returns:
        A list of ``(training_score, mean_cv_accuracy)`` tuples in the
        order: decision tree, Bernoulli naive Bayes, linear SVM,
        logistic regression.
    """
    features, labels = createObservations()
    features = np.array(features)
    labels = np.array(labels)

    # One factory per model, in reporting order.
    factories = (
        lambda: tree.DecisionTreeClassifier(),
        lambda: BernoulliNB(binarize=None),
        lambda: LinearSVC(),
        lambda: LogisticRegression(),
    )

    results = []
    for make_model in factories:
        model = make_model()
        model.fit(features, labels)
        training_score = model.score(features, labels)
        fold_scores = cross_validation.cross_val_score(
            model, features, labels, scoring='accuracy', cv=10)
        results.append((training_score, np.mean(fold_scores)))
    return results
def evaluateSubjectivity(k, tokenizer: Tokenizer, alphas):
    """Score Bernoulli naive Bayes on the subjectivity data for each alpha.

    The first 4000 documents of each stream are used for training and the
    remainder for testing. Documents are turned into counts with
    ``CountVectorizer`` and then re-weighted with ``TfidfTransformer``.

    Args:
        k: Unused by this function.
        tokenizer: Tokenizer handed to ``CountVectorizer``.
        alphas: Iterable of smoothing values to evaluate.

    Returns:
        A list with the test-set accuracy for each value in ``alphas``,
        in the same order.
    """
    count_vectorizer = CountVectorizer(tokenizer=tokenizer)
    objective_data_stream = stream_subjectivity_documents(
        PATH_TO_SUBJECTIVITY_DATA_OBJECTIVE, Labels.strong_pos)
    subjective_data_stream = stream_subjectivity_documents(
        PATH_TO_SUBJECTIVITY_DATA_SUBJECTIVE, Labels.strong_neg)
    X_objective_data, y_obj_labels = zip(*objective_data_stream)
    X_subjective_data, y_subj_labels = zip(*subjective_data_stream)

    # zip() yields tuples, so "+" concatenates objective then subjective.
    train_docs = X_objective_data[:4000] + X_subjective_data[:4000]
    train_labels = y_obj_labels[:4000] + y_subj_labels[:4000]
    test_docs = X_objective_data[4000:] + X_subjective_data[4000:]
    test_labels = y_obj_labels[4000:] + y_subj_labels[4000:]

    # get vector counts
    X_train_counts = count_vectorizer.fit_transform(train_docs)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # The test features do not depend on alpha, so build them once, and
    # send them through the same count -> tf-idf pipeline as the training
    # data. (BernoulliNB binarises at 0.0 by default, so this keeps the
    # reported accuracies unchanged while removing the train/test mismatch.)
    X_test_tfidf = tfidf_transformer.transform(
        count_vectorizer.transform(test_docs))

    accuracies = []
    for alpha in alphas:
        classifier = BernoulliNB(alpha=alpha)
        classifier.fit(X_train_tfidf, train_labels)
        accuracies.append(classifier.score(X_test_tfidf, test_labels))
    return accuracies
def nb():
    """Fit Bernoulli naive Bayes on the module-level training data.

    Reads the globals ``Xtrain``, ``Ytrain_labels``, ``Xtest`` and
    ``Ytest_labels``. Prints the predictions for training rows 2..299 and
    then the mean accuracy on the test data.
    """
    from sklearn.naive_bayes import BernoulliNB

    clf = BernoulliNB()
    clf.fit(Xtrain, Ytrain_labels)
    # The original also evaluated a bare ``BernoulliNB(alpha=1.0, ...)``
    # expression here (a pasted repr) whose result was discarded; removed.
    print(clf.predict(Xtrain[2:300]))
    print(clf.score(Xtest, Ytest_labels))
def _bernoulli_NB(self):
    """Fit Bernoulli naive Bayes on the stored split and report on it.

    Trains on ``self.X_train`` / ``self.y_train``, prints the accuracy on
    ``self.X_test`` / ``self.y_test`` to three decimals, then passes the
    first column of the predicted probabilities and the test labels to
    ``ks``.
    """
    model = BernoulliNB()
    model.fit(self.X_train, self.y_train)

    test_accuracy = model.score(self.X_test, self.y_test)
    print('Accuracy rate of Naive Bayes: {0:.3f}'.format(test_accuracy))

    probabilities = model.predict_proba(self.X_test)
    # Row 0 of the transpose is the probability column of the first class.
    first_class_probability = probabilities.T[0]
    ks(first_class_probability, self.y_test)
def getscores(target, data, n_extreme=20):
    """Classify the extreme ends of ``target`` with Bernoulli naive Bayes.

    Samples whose target is at or below the ``n_extreme``-th smallest value
    get class 0, samples at or above the ``n_extreme``-th largest value get
    class 2, and everything in between is discarded. The remaining samples
    are split 70/30 and the accuracy on the held-out part is returned.

    Args:
        target: 1-D array of target values (must provide ``tolist()``).
        data: Feature rows aligned with ``target``.
        n_extreme: Rank that defines the low/high cut-offs. The default of
            20 reproduces the original hard-coded thresholds.

    Returns:
        Mean accuracy of ``BernoulliNB`` on the held-out 30%.

    Raises:
        ValueError: If ``n_extreme`` is smaller than 1.
        IndexError: If ``target`` has fewer than ``n_extreme`` values.
    """
    if n_extreme < 1:
        raise ValueError("n_extreme must be at least 1")

    ys = sorted(target.tolist())
    neg = ys[n_extreme - 1]
    pos = ys[-n_extreme]

    values = np.asarray(target)
    is_low = values <= neg
    is_high = values >= pos
    keep = is_low | is_high

    # "Low" wins when a value satisfies both tests, matching the original
    # if/elif order. A boolean mask also copes with the case where no
    # sample falls in the middle band, which np.delete with an empty float
    # index array does not.
    y = np.where(is_low[keep], 0.0, 2.0)
    x = np.asarray(data)[keep]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    clf = BernoulliNB()
    clf.fit(x_train, y_train)
    return clf.score(x_test, y_test)
def render_content(self):
    """Train Bernoulli naive Bayes on the selected columns and render HTML.

    The text column is vectorised with ``CountVectorizer`` (user stop list,
    ``self.vocab_size`` features), the classifier is fitted on it against
    the code column, and the accuracy and confusion matrix are computed on
    that same data.

    Returns:
        ``"No text source selected."`` when ``self.text_source`` is None,
        otherwise an HTML fragment with the rounded accuracy and the
        confusion matrix inside a ``<pre>`` element.
    """
    if self.text_source is None:
        return "No text source selected."

    # Imported lazily, only once there is something to classify.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import metrics

    self.dm("creating vectorizer")
    stop_words = self.get_user_list(self.stop_list)
    vectorizer = CountVectorizer(stop_words=stop_words,
                                 max_features=self.vocab_size)
    documents = self.get_column_data(self.text_source)

    self.dm("using vectorizer")
    features = vectorizer.fit_transform(documents)
    codes = self.get_column_data(self.code_source)

    self.dm("creating classifier")
    classifier = BernoulliNB()
    classifier.fit(features, codes)
    accuracy = classifier.score(features, codes)

    self.dm("predicting")
    predicted = classifier.predict(features)
    confusion = metrics.confusion_matrix(codes, predicted)

    self.dm("displaying result")
    parts = ["accuracy is ", str(round(accuracy, 2)),
             '<pre>', str(confusion), '</pre>']
    return "".join(parts)
def evaluateIMDB(k, tokenizer: Tokenizer, alphas):
    """Score Bernoulli naive Bayes on the IMDB data for each alpha.

    Positive and negative documents are streamed from the train and test
    directories, turned into counts with ``CountVectorizer`` and
    re-weighted with ``TfidfTransformer``.

    Args:
        k: Unused by this function.
        tokenizer: Tokenizer handed to ``CountVectorizer``.
        alphas: Iterable of smoothing values to evaluate.

    Returns:
        A list with the test-set accuracy for each value in ``alphas``,
        in the same order.
    """
    count_vectorizer = CountVectorizer(tokenizer=tokenizer)

    train_pos_path = os.path.join(PATH_TO_IMDB_TRAIN_DATA, POS_LABEL)
    train_neg_path = os.path.join(PATH_TO_IMDB_TRAIN_DATA, NEG_LABEL)
    train_pos_data_stream = stream_documents(
        Labels.strong_pos, train_pos_path, os.listdir(train_pos_path))
    train_neg_data_stream = stream_documents(
        Labels.strong_neg, train_neg_path, os.listdir(train_neg_path))
    X_pos_train_data, y_pos_train_labels = zip(*train_pos_data_stream)
    X_neg_train_data, y_neg_train_labels = zip(*train_neg_data_stream)

    test_pos_path = os.path.join(PATH_TO_IMDB_TEST_DATA, POS_LABEL)
    test_neg_path = os.path.join(PATH_TO_IMDB_TEST_DATA, NEG_LABEL)
    test_pos_data_stream = stream_documents(
        Labels.strong_pos, test_pos_path, os.listdir(test_pos_path))
    test_neg_data_stream = stream_documents(
        Labels.strong_neg, test_neg_path, os.listdir(test_neg_path))
    X_pos_test_data, y_pos_test_labels = zip(*test_pos_data_stream)
    X_neg_test_data, y_neg_test_labels = zip(*test_neg_data_stream)

    # zip() yields tuples, so "+" concatenates. Documents and labels are
    # joined in the same order within each split (neg+pos for training,
    # pos+neg for testing).
    train_docs = X_neg_train_data + X_pos_train_data
    train_labels = y_neg_train_labels + y_pos_train_labels
    test_docs = X_pos_test_data + X_neg_test_data
    test_labels = y_pos_test_labels + y_neg_test_labels

    # get vector counts
    X_train_counts = count_vectorizer.fit_transform(train_docs)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # The test features do not depend on alpha, so build them once, and
    # send them through the same count -> tf-idf pipeline as the training
    # data. (BernoulliNB binarises at 0.0 by default, so this keeps the
    # reported accuracies unchanged while removing the train/test mismatch.)
    X_test_tfidf = tfidf_transformer.transform(
        count_vectorizer.transform(test_docs))

    accuracies = []
    for alpha in alphas:
        classifier = BernoulliNB(alpha=alpha)
        classifier.fit(X_train_tfidf, train_labels)
        accuracies.append(classifier.score(X_test_tfidf, test_labels))
    return accuracies
def test_BernoulliNB():
    """Fit and score Bernoulli naive Bayes for ptype values 1 through 7.

    For each ptype, ``get_dataframe`` supplies a training frame, a test
    frame and the feature column list; the ``'tag'`` column is the label.
    The test-set score is printed per ptype. One estimator instance is
    reused and re-fitted on every iteration.
    """
    classifier = BernoulliNB()
    for ptype in range(1, 8):
        train_frame, test_frame, feature_columns = get_dataframe(ptype)
        classifier.fit(train_frame[feature_columns], train_frame['tag'])
        test_score = classifier.score(test_frame[feature_columns],
                                      test_frame['tag'])
        print('ptype:', ptype, ' score:', test_score)
def train():
    """Train four classifiers on bag-of-words counts and print their scores.

    The corpus from ``preprocessing`` is split with ``split_data`` and
    vectorised with ``CountVectorizer(max_df=0.5, min_df=1)``. For each of
    MultinomialNB, BernoulliNB, LogisticRegression and SVC two numbers are
    printed: the score on the held-out split (the printed label calls it a
    cross-validation score) and the accuracy over the whole corpus.
    """
    corpus, all_labels = preprocessing()
    train_docs, test_docs, y_train, y_test = split_data(corpus, all_labels)

    vectorizer = CountVectorizer(max_df=0.5, min_df=1, stop_words=None)
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    X_all = vectorizer.transform(corpus)

    # (factory, label for the held-out score, label for whole-corpus accuracy)
    experiments = (
        (lambda: MultinomialNB(),
         "多项式贝叶斯分类器交叉验证得分: ", "多项式贝叶斯分类器准确率: "),
        (lambda: BernoulliNB(),
         "伯努利贝叶斯分类器交叉验证得分: ", "伯努利贝叶斯分类器准确率: "),
        (lambda: LogisticRegression(),
         "罗吉斯回归分类器交叉验证得分: ", "罗吉斯回归二分类器准确率: "),
        (lambda: svm.SVC(),
         "支持向量机分类器交叉验证得分: ", "支持向量机二分类器准确率: "),
    )

    for make_classifier, held_out_label, overall_label in experiments:
        classifier = make_classifier()
        classifier.fit(X_train, y_train)
        print(held_out_label, classifier.score(X_test, y_test))
        print(overall_label,
              accuracy_score(all_labels, classifier.predict(X_all)))
def BernoulliNB_classification(train, test, train_labels, test_labels, res=None):
    """Train and evaluate a Bernoulli naive Bayes classifier (alpha=0.7).

    The test predictions are passed to ``utils.report_and_confmat``; the
    original docstring says results are saved in the folder "Results",
    presumably by that helper.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that receives a ``"BernoulliNB"`` entry with
        the keys ``"model"``, ``"accuracy"`` and ``"name"``. A fresh dict is
        used when omitted, so results are no longer shared between calls
        through a mutable default.
    :return: tuple ``(score, bernNB)`` with the test accuracy and the
        fitted classifier
    """
    if res is None:
        res = {}

    print("Classifying with Bernoulli Nive Bayes...")
    bernNB = BernoulliNB(alpha=0.7)
    bernNB.fit(train, train_labels)
    prediction = bernNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "BernoulliNB")
    score = bernNB.score(test, test_labels)
    res["BernoulliNB"] = {
        "model": bernNB,
        "accuracy": score,
        "name": "BernoulliNB",
    }
    print("Bernoulli ended...")
    return score, bernNB
def train_idf():
    """Train four classifiers on tf-idf features and print their scores.

    The corpus from ``preprocessing`` is split with ``split_data`` and
    vectorised with ``TfidfVectorizer(max_df=0.5, min_df=1, use_idf=True)``.
    For each of MultinomialNB, BernoulliNB, LogisticRegression and SVC two
    numbers are printed: the score on the held-out split (the printed label
    calls it a cross-validation score) and the accuracy over the whole
    corpus.
    """
    corpus, all_labels = preprocessing()
    train_docs, test_docs, y_train, y_test = split_data(corpus, all_labels)

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=None,
                                 use_idf=True)
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)
    X_all = vectorizer.transform(corpus)

    # (factory, label for the held-out score, label for whole-corpus accuracy)
    experiments = (
        (lambda: MultinomialNB(),
         "多项式贝叶斯分类器交叉验证得分: ", "多项式贝叶斯分类器准确率: "),
        (lambda: BernoulliNB(),
         "伯努利贝叶斯分类器交叉验证得分: ", "伯努利贝叶斯分类器准确率: "),
        (lambda: LogisticRegression(),
         "罗吉斯回归分类器交叉验证得分: ", "罗吉斯回归二分类器准确率: "),
        (lambda: svm.SVC(),
         "支持向量机分类器交叉验证得分: ", "支持向量机二分类器准确率: "),
    )

    for make_classifier, held_out_label, overall_label in experiments:
        classifier = make_classifier()
        classifier.fit(X_train, y_train)
        print(held_out_label, classifier.score(X_test, y_test))
        print(overall_label,
              accuracy_score(all_labels, classifier.predict(X_all)))
class Model(object):
    """Text classifier: tf-idf features fed into Bernoulli naive Bayes."""

    def __init__(self):
        # A GradientBoostingClassifier configuration was previously tried
        # here (commented out in the original); BernoulliNB is the active
        # model.
        self.model = BernoulliNB(alpha=1)
        self.tfidf = TfidfVectorizer(max_df=1.0, min_df=1,
                                     stop_words='english', lowercase=True)

    def fit(self, X, y):
        """Fit the vectoriser and classifier, then pickle this object.

        Args:
            X: Iterable of raw text documents.
            y: Labels aligned with ``X``.

        Returns:
            ``self``, after it has been written to ``data/model.pkl``.
        """
        X = self.tfidf.fit_transform(X)
        self.model.fit(X, y)
        filename = 'data/model.pkl'
        # The context manager closes the file even if pickling fails.
        with open(filename, 'wb') as model_file:
            pickle.dump(self, model_file)
        return self

    def predict(self, X):
        """Return predicted labels for the raw text documents ``X``."""
        X = self.tfidf.transform(X)
        predictions = self.model.predict(X)
        return predictions

    def predict_proba(self, X):
        """Return class probabilities for the raw text documents ``X``."""
        X = self.tfidf.transform(X)
        proba_predictions = self.model.predict_proba(X)
        return proba_predictions

    def score(self, X, y):
        """Return the classifier's score on raw text ``X`` with labels ``y``."""
        X = self.tfidf.transform(X)
        score = self.model.score(X, y)
        return score
class BernoulliNaiveBayesClassifier:
    """Wrapper that holds training data and a ``BernoulliNB`` estimator."""

    def __init__(self, x_train, y_train):
        self._x_train = x_train
        self._y_train = y_train
        self._bernoulli_naive_bayes = BernoulliNB()

    def train(self):
        """Fit the estimator on the training data given at construction."""
        self._bernoulli_naive_bayes.fit(self._x_train, self._y_train)

    def test(self, x_test):
        """Return the predicted labels for ``x_test``."""
        return self._bernoulli_naive_bayes.predict(x_test)

    def accuracy(self, x_test, y_test):
        """Return the estimator's score on ``x_test`` / ``y_test``."""
        return self._bernoulli_naive_bayes.score(x_test, y_test)

    def get_average_f1_score(self, x_test, y_test):
        """Predict ``x_test``, log the labels to disk, return weighted F1.

        Each predicted label is appended, one per line, to
        ``saved_model_data/naive_bayes_labels.txt`` under the directory
        three levels above this file. That directory path is also printed.

        Returns:
            The weighted F1 score over the labels ``[1, 0, -1]``.
        """
        labels = [1, 0, -1]
        y_pred = self._bernoulli_naive_bayes.predict(x_test)

        # Save predicted labels
        project_relative_path = os.path.dirname(
            os.path.dirname(os.path.dirname(__file__)))
        print(project_relative_path)
        labels_path = os.path.join(
            project_relative_path, 'saved_model_data/naive_bayes_labels.txt')
        # The original never closed this handle; the context manager makes
        # sure the appended labels are flushed and the file is released.
        with open(labels_path, 'a') as output_file_sentiment_label:
            for label in y_pred:
                output_file_sentiment_label.write(str(label))
                output_file_sentiment_label.write('\n')

        return f1_score(y_test, y_pred, average='weighted', labels=labels)
def bnb(self):
    """Fit Bernoulli naive Bayes on the stored split and print diagnostics.

    Prints the test accuracy as a percentage, the classification reports
    for the training and test sets, and the one-vs-one and one-vs-rest
    ROC AUC computed from the predicted test probabilities.
    """
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.metrics import classification_report, roc_auc_score

    divider = '\n-------------------------------------------------------\n'

    model = BernoulliNB()
    model.fit(self.X_train, self.y_train)
    train_predictions = model.predict(self.X_train)
    test_predictions = model.predict(self.X_test)

    accuracy_percent = round(model.score(self.X_test, self.y_test) * 100, 2)
    print('Model Accuracy: ', accuracy_percent)
    print('Naive Bayes:\n 1. train 2. test')

    train_report = classification_report(self.y_train, train_predictions)
    test_report = classification_report(self.y_test, test_predictions)
    print(train_report, test_report, sep=divider)

    test_probabilities = model.predict_proba(self.X_test)
    auc_ovo = roc_auc_score(self.y_test, test_probabilities, multi_class='ovo')
    auc_ovr = roc_auc_score(self.y_test, test_probabilities, multi_class='ovr')
    print('ovo', auc_ovo, 'ovr', auc_ovr, sep=divider)
def tryBinomialNaiveBayes(goFast):
    """Grid-search BernoulliNB hyper-parameters against the validation set.

    Every combination of ``alpha`` and ``binarize`` in 0, 0.1, ..., 1 and
    ``fit_prior`` in (True, False) is fitted on the training data and
    scored on the validation data. Each score is printed, along with a
    notice whenever a new maximum is reached and the best score at the end.

    Args:
        goFast: When true, load the ``dt1_1500.*`` files with a fixed
            feature count of 253659; otherwise load the ``dt1.*`` files.
    """
    from itertools import product

    from sklearn.datasets import load_svmlight_file
    from sklearn.naive_bayes import BernoulliNB

    best_score = 0
    if goFast:
        training_data, training_labels = load_svmlight_file(
            "dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file(
            "dt1_1500.vld.svm", n_features=253659, zero_based=True)
        # The test split is loaded as in the original but is not used by
        # the search below.
        testing_data, testing_labels = load_svmlight_file(
            "dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm")
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")

    grid_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    for alpha_value, binarize_value, fit_prior_value in product(
            grid_values, grid_values, [True, False]):
        # Keyword arguments: current scikit-learn rejects positional
        # constructor parameters.
        binary_operator = BernoulliNB(alpha=alpha_value,
                                      binarize=binarize_value,
                                      fit_prior=fit_prior_value)
        binary_operator.fit(training_data, training_labels)
        current_score = binary_operator.score(validation_data,
                                              validation_labels)
        # %-formatted single expressions print identically on Python 2
        # and Python 3.
        print("Current test: %s %s %s"
              % (alpha_value, binarize_value, fit_prior_value))
        print("Current score: " + str(current_score))
        if current_score > best_score:
            best_score = current_score
            print("***NEW MAXIMUM SCORE: " + str(best_score))
            print("***NEW MAXIMUM PARAMETERS: %s %s %s"
                  % (alpha_value, binarize_value, fit_prior_value))
    print("Best score was " + str(best_score))
def doclassify(self, type='normal'):
    """Fit Bernoulli naive Bayes on the training data and print its score.

    Args:
        type: Only ``'normal'`` is handled; any other value makes the
            method return without doing anything.
    """
    if type != 'normal':
        return None

    clf = BernoulliNB()
    clf.fit(self.train_x, self.train_y)
    # The original also evaluated a bare ``BernoulliNB(alpha=1.0, ...)``
    # expression here (a pasted repr) whose result was discarded; removed.
    score = clf.score(self.train_x, self.train_y)
    # Same text as the Python 2 statement ``print 'score = ', score``,
    # but valid on Python 3 as well.
    print('score =  %s' % (score,))
def registerInitialState(self, state):
    """Load the recorded moves and train/evaluate the classifiers.

    Reads ``good-moves.txt``, converts every line with
    ``self.convertToArray`` and splits it into features (all but the last
    element) and target (the last element). Then runs the custom
    naive Bayes evaluation helpers and a scikit-learn ``BernoulliNB``
    baseline, storing all results on the instance.

    Args:
        state: Unused by this method.
    """
    # Open the data file and read all lines; the context manager closes
    # the handle even if reading fails. ``self.datafile`` is still set, as
    # before.
    with open('good-moves.txt', 'r') as self.datafile:
        content = self.datafile.readlines()

    # data is a list of feature lists (integers 0 or 1 indicating state);
    # target is a list of integers 0-3 indicating the action taken in
    # that state.
    self.data = []
    self.target = []
    for line in content:
        lineAsArray = self.convertToArray(line)
        self.data.append(list(lineAsArray[:-1]))
        self.target.append(lineAsArray[-1])

    # =====================================================================
    # Start: Running the classifier
    # =====================================================================
    # Train/test split with 0.2 for the custom classifier; returns the
    # score of that split.
    self.split_score = self.train_test_splitter(self.data, self.target, 0.2)
    # Custom-built cross-validation score for the custom classifier.
    self.cross_val_score = self.k_fold(self.data, self.target, 10)
    # Train the custom classifier on all the data.
    self.probabilities, self.prior = self.Naive_Bayes_Train(
        self.data, self.target)
    # Lets the metrics be printed only once (see getAction).
    self.score = 0
    # Training score of the custom classifier.
    self.training_score, self.matrix = self.Bayes_score(
        self.data, self.target)

    # scikit-learn baseline for comparison. ``cross_val_score`` here is the
    # module-level function, not the instance attribute assigned above.
    clf = BernoulliNB().fit(self.data, self.target)
    self.scikit_score = clf.score(self.data, self.target)
    self.scikit_cross_val = cross_val_score(
        clf, self.data, self.target, cv=10).mean()
    self.scikit_matrix = confusion_matrix(
        self.target, clf.predict(self.data))
def naive_bayes(X_train, Y_train, X_test, Y_test):
    """Fit Bernoulli naive Bayes, print its test score and pickle it.

    The fitted classifier is written to ``./naive_bayes_glove.sav``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels.
    """
    classifier = BernoulliNB()
    classifier.fit(X_train, Y_train)
    print("accuracy score of naive bayes")
    print(classifier.score(X_test, Y_test))

    filename = './naive_bayes_glove.sav'
    # The context manager closes the file; the original leaked the handle.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)
def bernoulli_nb(X_train, y_train, X_test, y_test):
    """Print the test score of BernoulliNB for a fixed set of alphas.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
    """
    smoothing_grid = (0.0000000001, 0.000001, 0.0001, 0.1, 0.2,
                      0.5, 0.8, 1.0, 1.5, 2.0)
    for smoothing in smoothing_grid:
        model = BernoulliNB(alpha=smoothing)
        model.fit(X_train, y_train)
        test_score = model.score(X_test, y_test)
        prefix = 'Score bnb(alpha=' + str(smoothing) + '): '
        print(prefix + str(test_score))
def cross_validate(X, y):
    """Return the hold-out accuracy of BernoulliNB as a percentage.

    The data is split 70/30 with ``random_state=0``; the classifier is
    fitted on the 70% and scored on the 30%.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=0)
    model = BernoulliNB()
    fitted = model.fit(X_train, y_train)
    hold_out_accuracy = fitted.score(X_test, y_test)
    return (hold_out_accuracy * 100)
def BNB(X_train, y_train, X_test, y_test, weights={0: 1, 1: 1}, alpha = 1.0, folder = "bush_models"):
    """Fit BernoulliNB, save it with joblib and print its test score.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        weights: Accepted but not used by this function.
        alpha: Smoothing parameter for ``BernoulliNB``.
        folder: Directory for the dump; the file is named
            ``<alpha>_bnb.joblib``.
    """
    model = BernoulliNB(alpha = alpha)
    model = model.fit(X_train, y_train)
    model_path = folder + "/" + str(alpha) + '_bnb.joblib'
    joblib.dump(model, model_path)
    test_score = model.score(X_test, y_test)
    print(test_score)
def predict_NB_Bernoulli(X, Y):
    """Fit BernoulliNB and report validation accuracy and confidence.

    The data is split 85/15 into development and test parts, and the
    development part is split again so the validation set is 15% of the
    whole. Both splits use ``random_state=42``. The test part is not used.

    Prints the validation accuracy and the average probability the model
    assigns to the true class of each validation sample.

    Returns:
        The validation accuracy.
    """
    X_dev, X_test, y_dev, y_test = train_test_split(
        X, Y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_dev, y_dev, test_size=0.15 / 0.85, random_state=42)

    clf = BernoulliNB()
    clf.fit(X_train, y_train)

    # Score once and reuse it for both the print and the return value.
    accuracy = clf.score(X_val, y_val)
    print("Accuracy: " + str(accuracy))

    prob_X = clf.predict_proba(X_val)
    # predict_proba columns follow clf.classes_, so map each label to its
    # column instead of using the label value as an index. Iterating y_val
    # (rather than y_val[i]) also keeps this positional when y_val is a
    # pandas Series with a shuffled index.
    column_of = {label: column for column, label in enumerate(clf.classes_)}
    prob_score = 0
    for row, label in zip(prob_X, y_val):
        prob_score += row[column_of[label]]
    print("Average prob for correct classes: "
          + str(prob_score / len(y_val)))
    return accuracy
def ml_algo(inp):
    """Train several models on the preprocessed CSV and predict ``inp``.

    Loads ``data/final_preprocess.csv`` (target column ``Result``),
    shuffles it with ``random_state=1`` and holds out 20% for testing.
    Six estimators are fitted, scored on the held-out data and asked to
    predict ``inp``; only the k-nearest-neighbours prediction is returned.

    Args:
        inp: Feature rows to predict, in the layout the estimators'
            ``predict`` expects.

    Returns:
        The first element of the k-nearest-neighbours prediction.
    """
    frame = pd.read_csv("data/final_preprocess.csv")
    X = np.array(frame.drop(['Result'], axis=1))
    y = np.array(frame['Result'])
    X, y = shuffle(X, y, random_state=1)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    # Estimators in the original fitting order. The decision tree uses the
    # entropy criterion, depth 3 and at least 5 samples per leaf.
    estimators = [
        ("centroid", NearestCentroid()),
        ("knn", KNeighborsClassifier(25)),
        ("svm", SVC()),
        ("lr", LinearRegression()),
        ("nb", BernoulliNB()),
        ("dtree", DecisionTreeClassifier(criterion="entropy",
                                         random_state=100, max_depth=3,
                                         min_samples_leaf=5)),
    ]

    fitted = [(key, estimator.fit(X_train, y_train))
              for key, estimator in estimators]

    # Held-out scores and predictions are computed for every model, as in
    # the original, even though only the knn prediction is returned.
    accuracies = {}
    for key, model in fitted:
        accuracies[key] = model.score(X_test, y_test)

    predictions = {}
    for key, model in fitted:
        predictions[key] = model.predict(inp)

    return predictions["knn"][0]
def getBernoulliNaiveBayesPredictions(bestAlpha, X_train, y_train, X_test, y_test):
    """Fit BernoulliNB with ``bestAlpha`` and evaluate it on the test set.

    The fitted model is printed.

    Returns:
        A tuple ``(predicted_test_labels, test_score)``.
    """
    classifier = BernoulliNB(alpha=bestAlpha)
    classifier.fit(X_train, y_train)
    print(classifier)
    predicted = classifier.predict(X_test)
    test_score = classifier.score(X_test, y_test)
    return predicted, test_score
def BNB(train_x, train_y, test_x, test_y):
    """Print the BernoulliNB test predictions and test accuracy.

    The predictions are reshaped to 10 x 12 before printing, so the test
    set must contain exactly 120 samples.
    """
    # Print the results of the BernoulliNB algorithm.
    model = BernoulliNB()
    model.fit(train_x, train_y)

    predictions = model.predict(test_x)
    prediction_grid = predictions.reshape(10, 12)
    print('BernoulliNB의 테스트 세트 예측 :\n{}'.format(prediction_grid))

    accuracy_percent = model.score(test_x, test_y) * 100
    print('BernoulliNB의 테스트 세트 정확도 : {0:0.2f}%'.format(
        accuracy_percent))
    print('------------------------------------------------------')
class TextClassifier(object):
    """Text classifier built from a vectoriser and a naive Bayes model.

    Raw text is turned into features with ``CountVectorizer`` and a
    ``BernoulliNB`` model is fitted on those features.
    """

    def __init__(self):
        self._vectorizer = CountVectorizer()
        self._classifier = BernoulliNB()

    def fit(self, X, y):
        """Fit the vectoriser and the classifier.

        Parameters
        ----------
        X: A numpy array or list of text fragments, used as predictors.
        y: A numpy array or python list of labels, used as responses.

        Returns
        -------
        self: The fitted model object.
        """
        features = self._vectorizer.fit_transform(X, y)
        self._classifier.fit(features, y=y)
        return self

    def predict_proba(self, X):
        """Return class probabilities for new text fragments."""
        return self._classifier.predict_proba(self._vectorizer.transform(X))

    def predict(self, X):
        """Return predicted labels for new text fragments."""
        return self._classifier.predict(self._vectorizer.transform(X))

    def score(self, X, y):
        """Return the classification accuracy on new data."""
        return self._classifier.score(self._vectorizer.transform(X), y)
def main():
    """Train naive Bayes models on one player's games and print results.

    Reads ``games-data.csv``, ``players.csv`` and ``fantasy_scores.csv``,
    builds the feature matrix for player 8439, splits it at the index
    returned by ``get_ninety_percent`` and fits a Bernoulli and a
    Multinomial naive Bayes classifier on the discretised fantasy scores.
    Prints per-row predictions, accuracies, probability estimates, an
    expected value and the elapsed time.
    """
    start_time = time.time()

    # read in game data
    games_data = pd.read_csv('games-data.csv')

    # read in player IDs
    player_data = pd.read_csv('players.csv')
    plyr_ids = np.unique(np.array(player_data['ID']))

    # read in fantasy scores
    fantasy_scores = pd.read_csv('fantasy_scores.csv')

    # gets player training matrix
    plyr_id = 8439
    X = create_training_set(plyr_id, games_data, plyr_ids)
    index = get_ninety_percent(len(np.array(X.index)))

    # hold-out split
    train_X = X[:index]
    test_X = X[index:]

    # gets training output vector
    # NOTE(review): the game ids come from train_X only, so Y may cover
    # just the training rows and test_Y below may be empty or misaligned
    # with test_X -- confirm what plyr_fantasy_pts returns.
    plyr_game_ids = np.array(train_X.index)
    scores = plyr_fantasy_pts(plyr_id, plyr_game_ids, fantasy_scores)
    Y = discretize(scores.values)
    train_Y = Y[:index]
    test_Y = Y[index:]

    # run Bernoulli NB Classifier
    nb_clf = BernoulliNB()
    nb_clf.fit(train_X, train_Y)
    nb_predictions = nb_clf.predict(test_X)

    # run Multinomial NB Classifier
    mn_clf = MultinomialNB()
    mn_clf.fit(train_X, train_Y)
    # Fixed: the original predicted with nb_clf here, so the
    # "multinomial" column repeated the Bernoulli predictions.
    mn_predictions = mn_clf.predict(test_X)

    # test for game, fantasy score alignment
    # The prints use %-formatting inside one parenthesised expression so
    # the text is the same on Python 2 and Python 3.
    for i in range(test_Y.shape[0]):
        print("%s %s %s %s %s" % (plyr_game_ids[i], scores.values[i],
                                  test_Y[i], nb_predictions[i],
                                  mn_predictions[i]))

    # Computed once and reused below.
    nb_probabilities = nb_clf.predict_proba(test_X)

    print("Bernoulli NB accuracy:  %s" % (nb_clf.score(test_X, test_Y),))
    print("Bernoulli NB prob estimates:  %s" % (nb_probabilities,))
    print("Multinomial NB accuracy:  %s" % (mn_clf.score(test_X, test_Y),))
    # Fixed label: these are the Multinomial estimates.
    print("Multinomial NB prob estimates:  %s"
          % (mn_clf.predict_proba(test_X),))

    print(len(nb_probabilities[0]))
    nb_norm_prob = normalize_probs(nb_probabilities[0])
    vals = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    ev = expected_val(nb_norm_prob, vals)
    print("EV:  %s" % (ev,))

    end_time = time.time()
    print("Elapsed time was %g seconds" % (end_time - start_time))
def trainBernoulliNB(X,y,loadweights):
    """Incrementally train BernoulliNB and persist it with pickle.

    Runs ``partial_fit`` ten times on the same data with classes
    ``[0, 1]``, writes the model to ``weights/BernoulliNB.pickle`` and
    prints its score on ``X`` / ``y``.

    Args:
        X: Training features.
        y: Training labels.
        loadweights: When true, continue from the pickled model instead of
            a fresh estimator.
    """
    print("Training BernoulliNB...")
    weights_path = 'weights/BernoulliNB.pickle'

    model = BernoulliNB()
    if loadweights:
        # NOTE: unpickling runs code from the file; only load weight files
        # this project wrote itself.
        with open(weights_path, 'rb') as saved:
            model = pickle.load(saved)

    passes_left = 10
    while passes_left > 0:
        model.partial_fit(X, y, classes=[0, 1])
        passes_left -= 1

    with open(weights_path, 'wb') as target:
        pickle.dump(model, target, protocol=pickle.HIGHEST_PROTOCOL)
    print(model.score(X, y))
def train_classifier(self):
    """Fit BernoulliNB, cross-validate it and report test-set metrics.

    Uses the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` rather than attributes of ``self``. The 10-fold cross
    validation is scored with ``f1_weighted`` (the printed label calls it
    accuracy).

    Returns:
        The fitted classifier.
    """
    print("Fitting data ...")
    model = BernoulliNB()
    clf = model.fit(x_train, y_train)

    # 10-fold cross validation on the training data.
    fold_scores = cross_val_score(clf, x_train, y_train, cv=10,
                                  scoring='f1_weighted')
    print("Cross validation score: " + str(fold_scores))
    # Mean and two standard deviations (about a 95% confidence level).
    mean_score = fold_scores.mean()
    two_sigma = fold_scores.std() * 2
    print("Cross validation accuracy: %0.2f (+/- %0.2f)"
          % (mean_score, two_sigma))

    # Evaluate on the test data.
    predicted_labels = clf.predict(x_test)
    print("Classifier score on test data is: %0.2f "
          % clf.score(x_test, y_test))
    print(metrics.classification_report(y_test, predicted_labels))
    confusion = metrics.confusion_matrix(y_test, predicted_labels)
    print(confusion)
    return clf
def main(X_data, y_data, test_size):
    """Return the hold-out score of BernoulliNB on a random split.

    Args:
        X_data: Feature matrix; the training part must provide
            ``toarray()``.
        y_data: Labels.
        test_size: ``1 - test_size`` is what gets passed to
            ``train_test_split`` as its test fraction.
            NOTE(review): this looks like the argument is really the
            training fraction -- confirm against the callers.

    Returns:
        The classifier's score on the held-out part.
    """
    held_out_fraction = 1 - test_size
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_data, y_data, test_size=held_out_fraction)
    dense_train = X_train.toarray()

    # create the classifier
    classifier = BernoulliNB()
    classifier.fit(dense_train, y_train)

    # report the classifier's result on the test data
    return classifier.score(X_test, y_test)
class SKLearnBernoulliNB(ClassificationModel):
    """``ClassificationModel`` backed by scikit-learn's ``BernoulliNB``."""

    def __init__(self, alpha=1.0):
        """Create the wrapped estimator with smoothing parameter ``alpha``."""
        self.bernoulli_nb = BernoulliNB(alpha=alpha)

    def train(self, data, labels):
        """Fit the wrapped estimator on ``data`` and ``labels``."""
        estimator = self.bernoulli_nb
        estimator.fit(data, labels)

    def score(self, data, labels):
        """Return the wrapped estimator's score on ``data`` / ``labels``."""
        estimator = self.bernoulli_nb
        return estimator.score(data, labels)

    def predict(self, data):
        """Return the wrapped estimator's predictions for ``data``."""
        estimator = self.bernoulli_nb
        return estimator.predict(data)
def BNB(alphas):
    """Compare BernoulliNB smoothing settings on the dev data.

    Uses the module-level ``train_data``, ``train_labels``, ``dev_data``
    and ``dev_labels``. All models binarise at 0.5. Prints the dev
    accuracy for the default alpha and for alpha=0, the best parameters
    found by ``GridSearchCV`` on the training data, and the dev accuracy
    of that best model.

    Args:
        alphas: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    """
    bnb_one = BernoulliNB(binarize=0.5)
    bnb_one.fit(train_data, train_labels)
    print(
        "\n\nBernoulli Naive Bayes accuracy when alpha = 1 (the default value):",
        bnb_one.score(dev_data, dev_labels))

    bnb_zero = BernoulliNB(binarize=0.5, alpha=0)
    bnb_zero.fit(train_data, train_labels)
    print("BNB accuracy when alpha = 0:",
          bnb_zero.score(dev_data, dev_labels))

    bnb = BernoulliNB(binarize=0.5)
    clf = GridSearchCV(bnb, param_grid=alphas)
    clf.fit(train_data, train_labels)
    print("Best parameter for BNB on the dev data:", clf.best_params_)

    # Score the model the search actually selected. With the default
    # refit=True, best_estimator_ is refitted on all the training data
    # using best_params_. The original hard-coded alpha=1e-23 here, which
    # only matches the search result for one particular grid.
    clf_tuned = clf.best_estimator_
    print("Accuracy using the tuned Laplace smoothing parameter:",
          clf_tuned.score(dev_data, dev_labels), "\n\n")
def performBernoulli(self, alpha=1.0):
    """Fit BernoulliNB on the stored split and time training and scoring.

    Args:
        alpha: Smoothing parameter for ``BernoulliNB``.

    Returns:
        The accuracy on ``self.X_test`` / ``self.Y_test``.
    """
    t0 = time.time()
    clf = BernoulliNB(alpha=alpha)
    clf.fit(self.X_train, self.Y_train)
    print("Time taken to Train: %s seconds ---" % (time.time() - t0))

    t0 = time.time()
    accuracy = clf.score(self.X_test, self.Y_test)
    print("Time taken to Tests: %s seconds ---" % (time.time() - t0))

    # Parenthesised so it parses on Python 3 as well; the output text is
    # unchanged.
    print("Accuracy : %s" % accuracy)
    return accuracy
def linear_svm(training_data, testing_data, training_target, testing_target):
    """Fit BernoulliNB and return the ROC AUC on the test data.

    Despite the name, the estimator used is ``BernoulliNB``. Prints the
    first 30 predicted probabilities (second column of ``predict_proba``),
    the first 30 test targets, the elapsed time (which covers fitting,
    prediction and the AUC computation) and the mean accuracy.

    Returns:
        ``roc_auc_score`` of the test targets against the second
        probability column.
    """
    started = time()

    model = BernoulliNB()
    model.fit(training_data, training_target)
    probabilities = model.predict_proba(testing_data)
    second_column = probabilities[:, 1]
    print(second_column[:30])
    print(testing_target[:30])

    auc = roc_auc_score(testing_target, second_column)
    finished = time()

    print("Training time: {}".format(finished - started))
    mean_accuracy = model.score(testing_data, testing_target)
    print("mean accuracy:{}".format(mean_accuracy))
    return auc
def predict_scores(markers, threshold=0.05):
    """Rank markers by how well a ``BNB`` model fits their own data.

    For every marker a ``BNB`` model is fitted on ``marker["individuals"]``
    against ``marker["population_labels"]`` and scored on that same data.
    A marker whose fit or scoring fails gets a score of 0.0.

    Args:
        markers: Sequence of mappings with the keys ``"individuals"`` and
            ``"population_labels"``.
        threshold: Fraction of the markers to keep.

    Returns:
        The top ``int(threshold * len(markers))`` entries as
        ``(score, marker_index)`` tuples, best first; ties are ordered by
        descending index.
    """
    scores = []
    for i, marker in enumerate(markers):
        try:
            bnb = BNB()
            bnb.fit(marker["individuals"], marker["population_labels"])
            scores.append(
                (bnb.score(marker["individuals"],
                           marker["population_labels"]), i))
        except Exception:
            # Best effort: a failing marker scores 0.0. Catching Exception
            # instead of using a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            scores.append((0.0, i))

    # The (score, index) tuples are all distinct, so one descending sort
    # equals the original sort-then-reverse.
    scores.sort(reverse=True)
    cutoff_idx = int(threshold * len(scores))
    return scores[:cutoff_idx]
def do_TRT(ne = 10, md = 3):
    """Score BernoulliNB on a random-trees embedding of the glass data.

    The training and test features from ``analysis_glass`` are stacked,
    embedded together with ``RandomTreesEmbedding`` and split back into
    their training and test rows before the classifier is fitted.

    Args:
        ne: Number of trees (``n_estimators``) in the embedding.
        md: Maximum tree depth (``max_depth``) in the embedding.

    Returns:
        The classifier's score on the embedded test rows.
    """
    from sklearn.ensemble import RandomTreesEmbedding
    from sklearn.naive_bayes import BernoulliNB

    train_X, train_Y, test_X, test_Y = analysis_glass()
    n_train = len(train_X)
    all_X = np.vstack((train_X, test_X))

    hasher = RandomTreesEmbedding(n_estimators=ne,
                                  random_state=0, max_depth=md)
    all_X_trans = hasher.fit_transform(all_X)

    # Fixed: the original sliced the raw all_X here, so the embedding was
    # computed but never used. The split point is the number of training
    # rows rather than a hard-coded 149.
    train_X_trans = all_X_trans[0:n_train, :]
    test_X_trans = all_X_trans[n_train:, :]

    nb = BernoulliNB()
    nb.fit(train_X_trans, train_Y)
    return nb.score(test_X_trans, test_Y)
def compare_sklearn(self):
    """Compare this implementation's accuracy with scikit-learn's.

    Fits ``BernoulliNB(alpha=0)`` on ``self.np_reps`` / ``self.gold``,
    scores it on that same data and prints it next to ``self.accuracy``.
    Assumes that evaluate_accuracy has been called.

    Raises:
        RuntimeError: If ``self.accuracy_tested`` is false.
    """
    if not self.accuracy_tested:
        # Raising a plain string is itself a TypeError; raise a real
        # exception that carries the message.
        raise RuntimeError(
            'you must test the accuracy of the classifier before '
            'comparing to sklearn')

    # Parenthesised single-expression prints give the same output on
    # Python 2 and Python 3.
    print("--> Checking sklearn's accuracy...")
    X = np.array(self.np_reps)
    nb = BernoulliNB(alpha=0)
    y = np.array(self.gold)
    nb.fit(X, y)
    print("...done.")
    print("sklearn accuracy is %f. Our accuracy was %f. "
          % (nb.score(X, y), self.accuracy))
def plot_scores(markers, flname):
    """Plot a histogram of per-marker ``BNB`` training scores.

    For every marker a ``BNB`` model is fitted on ``marker["individuals"]``
    against ``marker["population_labels"]`` and scored on that same data.
    A marker whose fit or scoring fails counts as 0.0.

    Args:
        markers: Sequence of mappings with the keys ``"individuals"`` and
            ``"population_labels"``.
        flname: Path the figure is saved to.
    """
    plt.clf()
    scores = []
    for marker in markers:
        try:
            mnb = BNB()
            mnb.fit(marker["individuals"], marker["population_labels"])
            scores.append(
                mnb.score(marker["individuals"],
                          marker["population_labels"]))
        except Exception:
            # Best effort, but let KeyboardInterrupt/SystemExit propagate.
            scores.append(0.0)

    # 100 bins of width 0.01 covering [0, 1]. The original edges stopped
    # at 0.99, so scores above 0.99 (including a perfect 1.0) were left
    # out of the histogram.
    plt.hist(scores, bins=np.linspace(0.0, 1.0, 101))
    plt.xlabel("Score", fontsize=18)
    plt.ylabel("Occurrences", fontsize=18)
    # savefig's resolution keyword is lowercase ``dpi``.
    plt.savefig(flname, dpi=200)
def bnb(training_data, training_target, testing_data, testing_target):
    """Fit BernoulliNB on the training data and score it on the test data.

    Args:
        training_data: Training features.
        training_target: Training labels.
        testing_data: Test features.
        testing_target: Test labels.

    Returns:
        The classifier's score on the test data.
    """
    model = BernoulliNB()
    model.fit(training_data, training_target)
    test_score = model.score(testing_data, testing_target)
    return test_score
def BernoulliNaiveBayes(listOfTrainComments, listOfTestComments, listOfUniqueTokens):
    """Train BernoulliNB on bag-of-words features and print test accuracy.

    Every comment is turned into features with ``generateBOW`` and its
    label is read with ``getStatus()``.

    Args:
        listOfTrainComments: Comments used for training.
        listOfTestComments: Comments used for testing.
        listOfUniqueTokens: Vocabulary handed to ``generateBOW``.
    """
    def featurize(comments):
        # Build features and labels one comment at a time, in order.
        features, labels = [], []
        for comment in comments:
            features.append(generateBOW(comment, listOfUniqueTokens))
            labels.append(comment.getStatus())
        return features, labels

    xTrain, yTrain = featurize(listOfTrainComments)
    xTest, yTest = featurize(listOfTestComments)

    model = BernoulliNB()
    model.fit(xTrain, yTrain)
    test_accuracy = model.score(xTest, yTest)
    accuracy_text = str(round(test_accuracy*100, 2))
    print('Bernoulli Naive Bayes Classifier, Accuracy - ' + accuracy_text + '%', '\n')
def classify(opts, data_train, labels_train):
    """Build the model selected by ``opts.classifier`` and fit it.

    Recognised values of ``opts.classifier``:
        'nb'  -> BernoulliNB(binarize=None)
        'lr'  -> LinearRegression()
        'log' -> LogisticRegression()
        'svm' -> LinearSVC()

    Args:
        opts: object exposing a ``classifier`` attribute.
        data_train: training features passed to ``fit``.
        labels_train: training targets passed to ``fit``.

    Returns:
        The fitted model.

    Raises:
        Exception: ``'Unrecognized classifier!'`` for any other value.
    """
    if opts.classifier == 'nb':
        classifier = BernoulliNB(binarize=None)
    elif opts.classifier == 'lr':
        classifier = LinearRegression()
    elif opts.classifier == 'log':
        classifier = LogisticRegression()
    elif opts.classifier == 'svm':
        classifier = LinearSVC()
    else:
        raise Exception('Unrecognized classifier!')

    # The training score used to be computed here and thrown away, costing a
    # full prediction pass; callers can call `.score()` on the returned model.
    classifier.fit(data_train, labels_train)
    return classifier
import numpy as np
import scipy.io
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# Labels come from the MATLAB file, features from the pre-split .npy dumps.
data = scipy.io.loadmat('NewsGroup.mat')
TRAIN_LABEL = data['TRAIN_LABEL']
TEST_LABEL = data['TEST_LABEL']

split_TEST_DATA = np.load("split_TEST_DATA.npy")
split_TRAIN_DATA = np.load("split_TRAIN_DATA.npy")
print(np.shape(split_TEST_DATA))
print(np.shape(split_TRAIN_DATA))

# Flatten each label array to 1-D (length = its first dimension) once.
train_labels_flat = np.reshape(TRAIN_LABEL, (np.shape(TRAIN_LABEL)[0]))
test_labels_flat = np.reshape(TEST_LABEL, (np.shape(TEST_LABEL)[0]))

# Multinomial naive Bayes: print train accuracy, then test accuracy.
mnb = MultinomialNB(alpha=1, fit_prior=True)
# NOTE(review): `y_pred` receives whatever `fit` returns (presumably the
# fitted estimator, not predictions) -- confirm before relying on it.
y_pred = mnb.fit(split_TRAIN_DATA, train_labels_flat)
print(mnb.score(split_TRAIN_DATA, train_labels_flat))
print(mnb.score(split_TEST_DATA, test_labels_flat))

# Bernoulli naive Bayes: same protocol.
bnb = BernoulliNB(alpha=1, fit_prior=True)
y_pred = bnb.fit(split_TRAIN_DATA, train_labels_flat)
print(bnb.score(split_TRAIN_DATA, train_labels_flat))
print(bnb.score(split_TEST_DATA, test_labels_flat))
def run_bernoulli_naive_bayes(training_data,training_target,testing_data,testing_target):
    """Fit a default BernoulliNB on the training set and return its test score.

    ``training_data`` / ``training_target`` are passed to ``fit``;
    ``testing_data`` / ``testing_target`` are passed to ``score``, whose
    result is returned unchanged.
    """
    model = BernoulliNB()
    model.fit(training_data, training_target)
    test_score = model.score(testing_data, testing_target)
    return test_score
def main(): ##### DO NOT MODIFY THESE OPTIONS ########################## parser = argparse.ArgumentParser() parser.add_argument('-training', required=True, help='Path to training data') parser.add_argument('-business_file', required=True, help='Path to business data') parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm') parser.add_argument('-top', type=int, help='Number of top features to show') parser.add_argument('-test', help='Path to test data') opts = parser.parse_args() ############################################################ ##### BUILD TRAINING SET ################################### # Initialize CountVectorizer # You will need to implement functions in tokenizer.py tokenizer = Tokenizer() vectorizer = CountVectorizer(binary=True, lowercase=True, decode_error='replace', tokenizer=tokenizer) csv_file = open(opts.training) file_reader = csv.reader(csv_file) tweets = [] lable = [] for line in file_reader: tweets.append(line[2]) lable.append(int(line[1])) vocabulary = vectorizer.fit_transform(tweets) #print tweets lable = np.array(lable) #print lable # Load training text and training labels # (make sure that your labels are converted to integers (0 or 1, not '0' or '1') # so that we can enforce the condition that label data is binary) # Get training features using vectorizer # Transform training labels to numpy array (numpy.array) ############################################################ ##### TRAIN THE MODEL ###################################### # Initialize the corresponding type of the classifier and train it (using 'fit') if opts.classifier == 'nb': classifier = BernoulliNB(binarize=None) classifier.fit(vocabulary, lable) elif opts.classifier == 'log': classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None) classifier.fit(vocabulary, lable) elif opts.classifier == 'svm': classifier = LinearSVC(penalty='l2', loss='l2', 
dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None) classifier.fit(vocabulary, lable) else: raise Exception('Unrecognized classifier!') ############################################################ ###### VALIDATE THE MODEL ################################## # Print training mean accuracy using 'score' print ("Training accuracy: %f" % classifier.score(vocabulary, lable)) # Perform 10 fold cross validation (cross_validation.cross_val_score) with scoring='accuracy' # and print the mean score and std deviation scores = cross_validation.cross_val_score(classifier, vocabulary, lable, scoring = 'accuracy', cv=10) print("Cross-Validation Accuracy: %f (+/- %f)" % (scores.mean(), scores.std())) ############################################################ ##### EXAMINE THE MODEL #################################### if opts.top is not None: # print top n most informative features for positive and negative classes print 'Most informative features' util.print_most_informative_features(opts.classifier, vectorizer, classifier, opts.top) ############################################################ ##### TEST THE MODEL ####################################### if opts.test is None: # Test the classifier on one sample test tweet # Tim Kraska 10:43 AM - 5 Feb 13 test_tweet = 'Water dripping from 3rd to 1st floor while the firealarm makes it hard to hear anything. BTW this is the 2nd leakage. Love our new house' terms = vectorizer.transform([test_tweet]) # Print the predicted label of the test tweet print classifier.predict(terms) # Print the predicted probability of each label. 
if opts.classifier != 'svm': # Use predict_proba print classifier.predict_proba(terms) else: # Use decision_funcion print classifier.decision_function(terms) else: # Test the classifier on the given test set # Extract features from the test set and transform it using vectorizer csv_file = open(opts.test) file_reader = csv.reader(csv_file) test_tweets = [] true_lable = [] business = [] for line in file_reader: business.append(line[0]) test_tweets.append(line[2]) true_lable.append(int(line[1])) terms = vectorizer.transform(test_tweets) true_lable = np.array(true_lable) predict_lable = classifier.predict(terms) # Print test mean accuracy accuracy = (len(true_lable) - sum(true_lable^predict_lable))/len(true_lable) print ("Test accuracy: %f" % accuracy) # Predict labels for the test set # Print the classification report target_names = ['Negative', 'Positive'] if opts.classifier != 'svm': test_predicted_proba = classifier.predict_proba(terms) util.plot_roc_curve(true_lable, test_predicted_proba) positive_prob = [] negative_prob = [] for i, item in enumerate(true_lable): if true_lable[i] == 1: positive_prob.append([i, test_predicted_proba[i][0], test_predicted_proba[i][1]]) else: negative_prob.append([i, test_predicted_proba[i][0], test_predicted_proba[i][1]]) sorted_positive = sorted(positive_prob, key=itemgetter(1), reverse= True) positive_bias = sorted_positive[0:100] sorted_negative = sorted(negative_prob, key=itemgetter(1)) negative_bias = sorted_negative[0:100] bfile = open(opts.business_file, 'r') bdic = {} for line in bfile: line = json.loads(line) bdic[line['business_id']] = [line['name'], line['full_address']] positive = open('positive_bias.csv', 'w') writer_positive = csv.writer(positive) negative = open('negative_bias.csv', 'w') writer_negative = csv.writer(negative) for item in positive_bias: writer_positive.writerow((bdic[business[item[0]]][0], bdic[business[item[0]]][1])) for item in negative_bias: writer_negative.writerow((bdic[business[item[0]]][0], 
bdic[business[item[0]]][1])) '''
def main():
    # Pull California bill and legislator records from MongoDB, join them
    # with a median-income CSV, derive per-bill features, fit a BernoulliNB
    # on a patsy design matrix and print the model's parameters.
    logger.info('Started')

    #============================================================================
    #Establish connection and make database object
    #============================================================================
    db = EstablishConnection()
    # NOTE(review): the four collection handles below are never used again in
    # this function; the queries go through `db.<collection>` directly.
    bill_table = db.ca_bills #bill table
    bill_d_table = db.bills_details #bill details table
    legislator_table = db.legislators #legislator table
    committee_table = db.committees #committee table

    #==================================================================================================
    #Query MongoDB to pull relevant data
    #==================================================================================================
    # try:
    #     bills_details = list(db.bills_details.find({'state':'ca', 'type': 'bill'},
    #     {'_id': 1, 'session':1, 'chamber': 1, 'sponsors': 1, 'sponsors.leg_id':1, 'scraped_subjects': 1, 'subjects':1, 'type': 1,
    #     'action_dates': 1, 'votes': 1, 'actions': 1}).limit(10000) )
    #     legis_details = list(db.legislators.find({'state': 'ca','level':'state'},
    #     {'_id': 1,'leg_id': 1,'party': 1,'district': 1,'active': 1 ,'chamber': 1}).limit(10000) )
    #     logger.info('Data succesfully obtained from MongoDB.\n')
    # except:
    #     logger.info('Something went with wrong Querying MongoDB.\n')
    #     pass

    # Each query takes a filter and a field projection, capped at 5000 documents.
    bills_details = list(db.bills_details.find({'state':'ca', 'type': 'bill'},
        {'_id': 1, 'session':1, 'chamber': 1, 'sponsors': 1, 'sponsors.leg_id':1, 'scraped_subjects': 1, 'subjects':1, 'type': 1,
        'action_dates': 1, 'votes': 1, 'actions': 1}).limit(5000) )
    legis_details = list(db.legislators.find({'state': 'ca','level':'state'},
        {'_id': 1,'leg_id': 1,'party': 1,'district': 1,'active': 1 ,'chamber': 1}).limit(5000) )
    logger.info('Data succesfully obtained from MongoDB.\n')

    logger.info('Creating legis dataframe...........\n')
    df_legis = pd.DataFrame(legis_details)
    df_bills_d = pd.DataFrame(bills_details)
    logger.info('Finished creating DataFrame........\n')

    logger.info('Uploading median income by district data')
    # Column names applied to the income CSV.
    fnames = np.array(['locations', 'district', 'chamber', 'med_ann_income'])
    # NOTE(review): absolute, user-specific path; this only runs on a machine
    # that has this file at this location.
    income_df = pd.read_csv('/Users/ppchow/data_science/CA-leg-predict/Med_Family_Income_20082012.csv', names=fnames)
    # Right join: every legislator row is kept, with income data where the
    # (chamber, district) pair matches.
    legis_income_df = pd.merge(income_df, df_legis, on=['chamber', 'district'], how='right')
    legis_income_df = legis_income_df.drop(['_id', 'district', 'chamber'], axis=1)
    logger.info('Combined legislation and income dataframes')

    logger.info('Apply transformation to DataFrame......\n')
    # Derived per-bill columns, each computed by a project helper.
    df_bills_d['bill_duration'] = df_bills_d['action_dates'].apply(lambda lst: billDuration(lst))
    df_bills_d['bill_status'] = df_bills_d['actions'].map(lambda lst: billStatus(lst))
    df_bills_d['primary_sponsors'] = df_bills_d['sponsors'].map(lambda lst: primarySponsors(lst))
    df_bills_d['co_sponsors'] = df_bills_d['sponsors'].map(lambda lst: coSponsors(lst))
    # Join key: leg_id of the first listed sponsor.
    # NOTE(review): raises if a bill's sponsors list is empty -- confirm the data guarantees one.
    df_bills_d['leg_id'] = df_bills_d['sponsors'].map(lambda lst: lst[0]['leg_id'])
    # Drop the raw columns that the derived columns replace.
    df_bills_d = df_bills_d.drop(['action_dates', 'actions', 'session', 'subjects', 'scraped_subjects', 'votes', 'type', 'sponsors'], axis = 1)
    df_bills_d.fillna(0, inplace = True)
    # Outer join keeps bills without a matching legislator and vice versa.
    df_bills_d_merged = pd.merge(legis_income_df, df_bills_d, on='leg_id', how='outer')
    print 'Prints Merged Dataframe', df_bills_d_merged
    logger.info('Done applying transformation to DataFrame........\n')

    #===============================================================================
    # APPLY NAIVE BAYES MODEL TO DATAFRAME
    #===============================================================================
    # NOTE(review): the next three results are discarded; they only have a
    # visible effect in an interactive session.
    df_bills_d_merged.describe()
    df_bills_d_merged[df_bills_d_merged['bill_status'] == 1 ].describe()
    df_bills_d_merged.head()

    # Design matrices from a formula; '- 1' removes the intercept column.
    y, X = dmatrices('bill_status ~ bill_duration + primary_sponsors + co_sponsors + locations + party - 1', data=df_bills_d_merged, return_type='dataframe')
    # NOTE(review): selects the 'bill_status[yes]' column, while the filter
    # above compares bill_status to 1 -- confirm which encoding is in use.
    yy = y['bill_status[yes]']
    clf = BernoulliNB().fit(X, yy)
    print clf.intercept_
    print math.exp(clf.intercept_)
    # NOTE(review): labelled "R2", but this is a classifier's `score`
    # (presumably mean accuracy) -- confirm.
    print 'NB Score/R2', clf.score(X,yy)
    print "Coefs", clf.coef_[0]
    # Column indices ordered by ascending coefficient.
    top = np.argsort(clf.coef_[0])
    print top
    print clf.coef_[0][top]
    print 'X.columns top', X.columns[top]
def classify(data, Sensoren, classifier="Bayes"):
    """Train the selected model on the first 7000 rows and evaluate it on the rest.

    Column 1 of ``data`` is the target; the columns selected by ``Sensoren``
    are the features. The score, the number of misclassified test rows and
    the confusion matrix are printed and appended to ``History.txt``.

    Args:
        data: 2-D array supporting ``data[:7000, Sensoren]`` indexing.
        Sensoren: column selector for the feature columns.
        classifier: one of "Bayes", "Gradient", "Linear", "LDA", "AdaBoost",
            "Forest", "SVM", "DecisionTree". Any other value prints a
            warning and falls back to ``GaussianNB``.

    Returns:
        Tuple ``(clf, X_train, X_test, y_train, y_test)``.
    """
    # Fixed split: first 7000 rows train, the remainder test. A random
    # train_test_split used to run first, but its result was immediately
    # overwritten by these slices, so the call was dropped.
    X_train = data[:7000, Sensoren]
    y_train = data[:7000, 1]
    X_test = data[7000:, Sensoren]
    y_test = data[7000:, 1]

    # Select the classifier. Compare by value: `is` tests object identity
    # and only matches equal strings by accident of interning.
    if classifier == "Bayes":
        clf = BernoulliNB()
        history = "Klassifizierer: Naive Bayes" + "\n"
    elif classifier == "Gradient":
        clf = SGDClassifier()
        history = "Klassifizierer: Gradient Decent" + "\n"
    elif classifier == "Linear":
        clf = linear_model.LinearRegression()
        history = "Klassifizierer: Linear Regression" + "\n"
    elif classifier == "LDA":
        clf = LDA()
        history = "Klassifizierer: LDA" + "\n"
    elif classifier == "AdaBoost":
        clf = AdaBoostClassifier(n_estimators=100)
        history = "Klassifizierer: AdaBoost" + "\n"
    elif classifier == "Forest":
        clf = RandomForestClassifier(n_estimators=100)
        history = "Klassifizierer: Forest" + "\n"
    elif classifier == "SVM":
        clf = svm.SVC()
        history = "Klassifizierer: SVN" + "\n"
    elif classifier == "DecisionTree":
        clf = tree.DecisionTreeClassifier(criterion="entropy")
        history = "Klassifizierer: DecisionTree" + "\n"
    else:
        # Unknown name: warn and fall back to GaussianNB.
        print("kein korrekter Klassifizierer gewawehlt,Naive Bayes wurde verwendet")
        history = "Klassifizierer: Fehler" + "\n"
        clf = GaussianNB()

    # Train the classifier, then predict the test rows once and reuse the result.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Number of test rows whose prediction differs from the target.
    misclassified = sum(1 for diff in (predictions - y_test) if diff)
    score = clf.score(X_test, y_test)
    confusionMatrix = confusion_matrix(y_test, predictions)

    print("Fehlerkennung: " + str(misclassified))
    print("Score: " + str(score))
    print(confusionMatrix)

    history = history + "Score: " + str(score) + "\n"
    history = history + "Fehlerkennung: " + str(misclassified) + "\n"
    history = history + "Confusionsmatrix: " + "\n"
    history = history + str(confusionMatrix) + "\n"

    # Append to the run log; the context manager closes the file even on error.
    with open("History.txt", "a") as fd:
        fd.write(history)

    return clf, X_train, X_test, y_train, y_test
def main():
    """Train a binary blog classifier from the command line and evaluate it.

    Steps:
      1. Read the training CSV (label in column 0, text in column 1) and
         build binary bag-of-words features with ``CountVectorizer``.
      2. Fit the model chosen with ``-c`` (``nb``, ``log``, ``svm``, ``rf``
         or ``knn``); ``-trees`` sets the forest size for ``rf`` (default 10).
      3. Print training accuracy and 10-fold cross-validation accuracy.
      4. Optionally print the ``-top`` most informative features.
      5. Without ``-test``: classify one hard-coded sample blog.
         With ``-test``: print score, classification report and confusion
         matrix, then plot a ROC curve (non-SVM) or the decision values (SVM).

    Raises:
        Exception: ``'Unrecognized classifier!'`` for an unknown ``-c`` value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-training', required=True, help='Path to training data')
    parser.add_argument('-test', help='Path to test data')
    # The help text now lists every value the dispatch below accepts.
    parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm | rf | knn')
    parser.add_argument('-top', type=int, help='Number of top features to show')
    parser.add_argument('-trees',type=int,help="Number of trees (if random forest for classifier)")
    opts = parser.parse_args()

    ##### BUILD TRAINING SET ###################################
    vectorizer = CountVectorizer(binary=True, lowercase=True, decode_error='replace')

    # Training rows: column 0 is the label, column 1 the text. Labels are
    # converted to integers so the label data is numeric.
    with open(opts.training, 'rU') as f:
        train_data = list(csv.reader(f))
    train_text = [blog[1] for blog in train_data]
    train_labels = numpy.array([int(blog[0]) for blog in train_data], dtype=int)

    print("ready to vectorize training data")
    train_features = vectorizer.fit_transform(train_text)
    print("done vectorizing")
    ############################################################

    ##### TRAIN THE MODEL ######################################
    if opts.classifier == 'nb':
        classifier = BernoulliNB(binarize=None)
        print("Naive Bayes")
    elif opts.classifier == 'log':
        classifier = LogisticRegression(C=.088)
        print("Log")
    elif opts.classifier == 'svm':
        classifier = LinearSVC()
        print("Support Vector Machine")
    elif opts.classifier == 'rf':
        trees = opts.trees or 10
        classifier = RandomForestClassifier(n_estimators=trees)
        # The forest is fed dense arrays throughout this script.
        train_features = train_features.toarray()
    elif opts.classifier == 'knn':
        classifier = KNeighborsClassifier(n_neighbors=10)
    else:
        raise Exception('Unrecognized classifier!')
    classifier.fit(train_features,train_labels)
    ############################################################

    ###### VALIDATE THE MODEL ##################################
    # Training mean accuracy, then 10-fold cross-validation.
    print(classifier.score(train_features,train_labels))
    scores = cross_validation.cross_val_score(classifier,train_features,train_labels,cv=10,scoring='accuracy')
    print("Cross Validation Scores Calculated")
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    ############################################################

    ##### EXAMINE THE MODEL ####################################
    if opts.top is not None:
        print("Got "+str(opts.top)+" tops")
        # Top n most informative features for both classes.
        util.print_most_informative_features(opts.classifier, vectorizer, classifier, opts.top)
    ############################################################

    ##### TEST THE MODEL #######################################
    if opts.test is None:
        test_blog = "uses yahoo boss support search experience general web search perform query application set term candidates using key terms term its within result set its global measure similar 1ST_PERSON former colleagues 1ST_PERSON enterprise try yourself URL rough edges produces considering example 1ST_PERSON application explore learn 1ST_PERSON started 1ST_PERSON term 1ST_PERSON suggestions looked name caught 1ST_PERSON following 1ST_PERSON 1ST_PERSON again results 1ST_PERSON immediately had document further made clear someone 1ST_PERSON get home can_t you_ll experience 1ST_PERSON did 1ST_PERSON encourage"
        features = vectorizer.transform([test_blog])
        if opts.classifier == 'rf':
            features = features.toarray()

        # Predicted label of the sample blog.
        print("Prediction (1 == correct): ")
        print(classifier.predict(features))

        # LinearSVC has no predict_proba; print its decision values instead.
        if opts.classifier != 'svm':
            print("User predict prob ")
            print(classifier.predict_proba(features))
        else:
            print("use decision ")
            print(classifier.decision_function(features))
    else:
        # Test rows: column 0 is the label, the last column the text.
        # NOTE(review): opened 'rb' here but 'rU' for training; both work
        # with the Python 2 csv module -- confirm the target interpreter.
        with open(opts.test, 'rb') as f:
            test_data = list(csv.reader(f))
        test_text = [blog[-1] for blog in test_data]
        test_labels = numpy.array([int(blog[0]) for blog in test_data], dtype=int)

        print("ready to vectorize testing data")
        test_features = vectorizer.transform(test_text)
        if opts.classifier == 'rf':
            # Same dense conversion as for the training and sample features.
            test_features = test_features.toarray()

        print("Score")
        print(classifier.score(test_features,test_labels))

        predictions = classifier.predict(test_features)

        print("Classification report")
        print(classification_report(test_labels,predictions))

        print("Classifier uses: Confusion!")
        print(confusion_matrix(test_labels,predictions))
        print("It's super effective!")

        if opts.classifier != 'svm':
            print("Predicted Probability")
            test_predicted_proba = classifier.predict_proba(test_features)
            util.plot_roc_curve(test_labels, test_predicted_proba)
        else:
            print("Decision Function")
            decisions = classifier.decision_function(test_features)
            # Decision value per test sample, plotted against sample index.
            x = numpy.arange(0,len(decisions),1)
            plt.plot(x,decisions)
            plt.show()
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

# Red-wine data: every column but the last is a feature, 'quality' is the target.
df = pd.read_csv('dataset/winequality-red.csv', header=0, sep=';')
feature_columns = list(df.columns)[:-1]
X = df[feature_columns]
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y)

modelg = GaussianNB()
modelm = MultinomialNB()
modelb = BernoulliNB()

# Same protocol for each naive Bayes variant: fit, predict the held-out
# rows, print the score and the root of the mean squared error.
for score_label, model in (("GausseanNB Score:", modelg),
                           ("MultinomialNB Score:", modelm),
                           ("BernoulliNB Score: ", modelb)):
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print(score_label + str(model.score(X_test, y_test)))
    mse = mean_squared_error(y_predict, y_test)
    print("RMSE:" + str(mse ** 0.5))
def main():
    """Train the selected classifier on the expensive/cheap data and cross-validate it.

    Labels and features come from ``load_file(opts.training_expensive,
    opts.training_cheap)``. The model chosen with ``-c`` (``nb``, ``log`` or
    ``svm``) is fitted on all of it and evaluated with 10-fold
    cross-validation. Output is printed only when the ``-p`` flag is truthy.

    Raises:
        Exception: ``'Unrecognized classifier!'`` for an unknown ``-c`` value.
    """
    ##### DO NOT MODIFY THESE OPTIONS ##########################
    parser = argparse.ArgumentParser()
    parser.add_argument('-training_expensive', required=True, help='Path to expensive training data')
    parser.add_argument('-training_cheap', required=True, help='Path to cheap training data')
    # parser.add_argument('-test', help='Path to test data')
    parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
    parser.add_argument('-top', type=int, help='Number of top features to show')
    parser.add_argument('-p', type=bool, default='', help='If true, prints out information')
    opts = parser.parse_args()
    ############################################################

    # Nothing may be printed unless the print flag is set.
    # NOTE(review): with type=bool any non-empty value, including "False",
    # parses as True.
    verbose = (opts.p == True)

    ##### BUILD TRAINING SET ###################################
    labels, features = load_file(opts.training_expensive, opts.training_cheap)
    training_labels = numpy.array(labels)
    training_features = numpy.array(features)
    ############################################################

    ##### TRAIN THE MODEL ######################################
    selected = opts.classifier
    if selected == 'nb':
        classifier = BernoulliNB(binarize=None)
    elif selected == 'log':
        classifier = LogisticRegression(penalty='l2')
    elif selected == 'svm':
        classifier = LinearSVC()
    else:
        raise Exception('Unrecognized classifier!')
    classifier.fit(training_features, training_labels)
    ############################################################

    ###### VALIDATE THE MODEL ##################################
    if verbose:
        print("training mean accuracy using score " + str(classifier.score(training_features, training_labels)))

    # 10-fold cross-validation runs regardless of the print flag.
    est_scores = cross_validation.cross_val_score(classifier, training_features, training_labels, scoring='accuracy', cv=10)
    mean_est_scores = numpy.mean(est_scores)
    std_est_scores = numpy.std(est_scores)
    if verbose:
        print("10 fold cross training mean accuracy " + str(mean_est_scores))
        print("10 fold cross training standard deviation " + str(std_est_scores))
    ############################################################
# Evaluate the already-fitted `classifier` on the held-out (text, label)
# pairs in `test_x_y`. `vectorizer`, `classifier`, `train_x_y` and
# `test_x_y` are defined before this fragment.
test_texts = [item[0] for item in test_x_y]
test_labels = [item[1] for item in test_x_y]

# Get test features using the vectorizer (transform only, no re-fit)
test_features = vectorizer.transform(test_texts)

# Transform test labels to numpy array (numpy.array)
test_labels = numpy.array(test_labels)

predicted = classifier.predict(test_features)

# Raw confusion matrix, then a copy with each row divided by its row sum.
cm = confusion_matrix(test_labels, predicted)
print cm
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:,numpy.newaxis]
print cm_normalized
plt.figure()
plot_confusion_matrix(cm_normalized)
plt.show()

# NOTE(review): `score` presumably already returns a single mean value, so
# the `.mean()` below would be a no-op -- confirm.
test_scores = classifier.score(test_features, test_labels)
print 'Mean Score: ', test_scores.mean()
############################################################

# Sizes of the two sample sets.
print 'Train Number: ', len(train_x_y)
print 'Test Number: ', len(test_x_y)
NuSVC_classifier.fit(train_arrays, train_labels) print('NuSVC Accuracy: %.2f' %NuSVC_classifier.score(test_arrays, test_labels)) except: pass try: MultinomialNB_classifier = MultinomialNB() MultinomialNB_classifier.fit(train_arrays, train_labels) print('MultinomialNB Accuracy: %.2f' %MultinomialNB_classifier.score(test_arrays, test_labels)) except: pass try: BernoulliNB_classifier = BernoulliNB() BernoulliNB_classifier.fit(train_arrays, train_labels) print('BernoulliNB Accuracy: %.2f' %BernoulliNB_classifier.score(test_arrays, test_labels)) except: pass try: GaussianNB_classifier = GaussianNB() GaussianNB_classifier.fit(train_arrays, train_labels) print('GaussianNB Accuracy: %.2f' %GaussianNB_classifier.score(test_arrays, test_labels)) except: pass ################################################################################# Confusion_matrix from sklearn.metrics import roc_curve, auc import matplotlib.pyplot as plt from itertools import cycle
#mnb = MultinomialNB() #mnb.fit(tfidf_train, svm_train_tag) #MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) #score = mnb.score(tfidf_test, svm_test_tag) #print score binarizes=np.linspace(-0.007,-0.008,3)#best -0.0075 for b in binarizes: bnb = BernoulliNB(alpha=1.0, binarize=b, class_prior=[0.5047619048,0.4952380952], fit_prior=True) bnb.fit(svm_train_data, svm_train_tag) bnb_predict=bnb.predict(svm_test_data) test_score=bnb.predict_proba(svm_test_data) precision, recall, thresholds = precision_recall_curve(svm_test_tag, bnb_predict) bnb_fpr,bnb_tpr,bnb_thr=roc_curve(svm_test_tag,test_score[:,1]) bnb_auc = auc(bnb_fpr, bnb_tpr) plt.figure() plt.plot(bnb_fpr, bnb_tpr, lw = 1) #plt.legend(loc = 'lower right') plt.title("ROC curve of naive Bayes classifier") plt.show() # #BernoulliNB(alpha=1.0, binarize=0.5, class_prior=None, fit_prior=True) score=bnb.score(svm_test_data,svm_test_tag) print "BernoulliNB,",b print "confusion matrix:","\n",confusion_matrix(svm_test_tag, bnb_predict) print "score=",score print "precision=",precision[1] print "recall=",recall[1] print "auc=",bnb_auc print "\n"
def compare_sklearn(self, np_reps, gold):
    """Return the accuracy of sklearn's BernoulliNB on the given data.

    ``np_reps`` and ``gold`` are converted to NumPy arrays; a
    ``BernoulliNB(alpha=0)`` is fitted on them and scored on the same data.
    """
    features = np.array(np_reps)
    labels = np.array(gold)
    model = BernoulliNB(alpha=0)
    model.fit(features, labels)
    return model.score(features, labels)
#tf = TfidfVectorizer(sublinear_tf = True, analyzer='word', ngram_range=(1,3), lowercase=True, min_df=0, stop_words='english') tf = TfidfVectorizer(ngram_range=(2,2), lowercase=True,min_df=1) features_train = tf.fit_transform(features_train).toarray() features_test = tf.transform(features_test).toarray() print(features_train.size + features_test.size) print(len(labels_train)+ len(labels_test)) features_train = features_train[:1000] labels_train = labels_train[:1000] ###BernoulliNB BernoulliNB = BernoulliNB() BernoulliNB.fit(features_train,labels_train) print("BernoulliNB_classifier accuracy percent:", (BernoulliNB.score(features_test,labels_test))) ###trains a Naive Bayes Classifier #classifier = NaiveBayesClassifier.train(trainFeatures) ###trains a MultinomialNB Classifier MultinomialNB = MultinomialNB() MultinomialNB.fit(features_train,labels_train) print("MNB_classifier accuracy percent:", (MultinomialNB.score(features_test,labels_test))) #LogisticRegression lr = LogisticRegression() lr.fit(features_train,labels_train) print("Logistic Regression_classifier accuracy percent:",(lr.score(features_test,labels_test))) ######
# Convert the training feature matrix to a dense array and show its shape.
train_data_features = train_data_features.toarray()
print train_data_features.shape

print "Training the random NB..."

# Initialize a Bernoulli naive Bayes classifier with smoothing alpha=1000
NB = BernoulliNB(alpha = 1000)

# Fit the NB to the training set, using the bag of words as
# features and the "label" column of `train` as the response variable
NB = NB.fit( train_data_features, train["label"] )
# Score on the same data the model was fitted on.
print('accuracy on the training set: %f' %NB.score( train_data_features, train["label"] ))

# Read the test data
test = pd.read_csv("testDataFormated.csv", header=0, quoting=3 )

# Print the shape of the test data
print test.shape

# Create an empty list for the cleaned test texts
num_test_data = len(test["text"])
clean_test_text = []

print "Cleaning and parsing the test set...\n"
for i in xrange(0,num_test_data):
    # Progress message every 1000 rows.
    if( (i+1) % 1000 == 0 ):
        print "test %d of %d\n" % (i+1, num_test_data)
# Vectorize the documents: fit the vectorizer on the training docs and reuse
# it for the test docs. Samples are (doc, target) pairs.
train_vectors = vectorizer.fit_transform([doc for doc, target in train_samples])
test_vectors = vectorizer.transform([doc for doc, target in test_samples])
train_targets = [target for doc, target in train_samples]
test_targets = [target for doc, target in test_samples]

# <codecell>

classifier = BernoulliNB()

# <codecell>

classifier.fit(train_vectors, train_targets)

# <codecell>

# The result is not stored or printed here.
classifier.score(test_vectors, test_targets)

# <codecell>

# A helper function to see which features affect the classification the most
def show_most_informative_features(vectorizer, classifier, n=10):
    # Per-feature log probabilities for the first two classes; this assumes
    # index 0 is the negative class and index 1 the positive -- TODO confirm.
    neg = classifier.feature_log_prob_[0]
    pos = classifier.feature_log_prob_[1]
    # Difference of the two log probabilities, per feature.
    valence = (pos - neg)
    # Feature indices sorted by ascending valence.
    ordered = np.argsort(valence)
    # NOTE(review): `interesting` is not used in the visible part of this function.
    interesting = np.hstack([ordered[:n], ordered[-n:]])
    feature_names = vectorizer.get_feature_names()
    # The n features with the lowest valence.
    for index in ordered[:n]:
        print "%+4.4f\t%s" % (valence[index], feature_names[index])
    print '\t...'
    # The n features with the highest valence (loop body is outside this excerpt).
    for index in ordered[-n:]:
#[trainFiles["neg"]["path"]+x for x in trainFiles["neg"]["files"]]) xTrain = vectorizer.fit_transform([trainFiles["pos"]["path"]+x for x in trainFiles["pos"]["files"]]+ [trainFiles["neg"]["path"]+x for x in trainFiles["neg"]["files"]]) xTest = vectorizer.transform([testFiles["pos"]["path"]+x for x in testFiles["pos"]["files"]]+ [testFiles["neg"]["path"]+x for x in testFiles["neg"]["files"]]) clf =BernoulliNB(alpha=.01) #clf = MultinomialNB(alpha=.01) clf.fit(xTrain, [1]*max+[0]*max) print xTest.get_shape() y_score = clf.predict(xTest) y_prob = clf.predict_proba(xTest) y_test=[1]*199+[0]*199 scores=clf.score(xTest,[1]*199+[0]*199) #scores=clf.score(y_prob[:,0],[1]*200+[0]*200) print roc_auc_score([1]*199+[0]*199, y_prob[:,1], average='macro', sample_weight=None) #from sklearn.externals import joblib joblib.dump(clf, 'pickle/bernouliAdjAdv.pkl') joblib.dump(vectorizer, 'pickle/vecAdjAdv.pkl') # In[7]: fpr = dict() tpr = dict() roc_auc = dict() print 0 fpr, tpr, _ = roc_curve([1]*200+[0]*200, y_prob[:,1])