def naive_bayes(df, column):
    """Train a Bernoulli Naive Bayes sentiment model and label unknown reviews.

    Rows of ``df`` whose ``class`` is ``'positive'`` or ``'negative'`` are
    vectorized (1- to 3-grams, English stop words removed, TF-IDF weighted),
    split 60/40 into train/test and used to fit a ``BernoulliNB``.  A
    classification report, the accuracy and the most informative features are
    printed.  Rows whose ``class`` is the empty string are then classified and
    the number of predicted negatives/positives is printed.

    Args:
        df: DataFrame with a ``class`` column and the text column ``column``.
        column: Name of the column that holds the review text.
    """
    reviews_pn = df[df['class'].isin(['positive', 'negative'])]
    comments = list(reviews_pn[column].values)
    classes = list(reviews_pn['class'].values)

    # Build the term-frequency matrix, then TF-IDF weight it.  The fitted
    # transformer is kept so the unlabeled data can be weighted identically.
    # NOTE(review): both are fitted before the train/test split, so the test
    # documents contribute to the vocabulary and IDF weights.
    stop = stopwords.words('english')
    count_vectorizer = CountVectorizer(stop_words=stop, ngram_range=(1, 3))
    tfidf_transformer = TfidfTransformer(use_idf=True)
    term_counts = count_vectorizer.fit_transform(comments)
    tfidf_comments = tfidf_transformer.fit_transform(term_counts)

    # Split validation: 60% training, 40% test.
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        tfidf_comments, classes, test_size=0.4, random_state=43)

    classifier = BernoulliNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    print(classification_report(target_test, predicted))
    print("The accuracy score is {:.2%}".format(accuracy_score(target_test, predicted)))
    most_informative_feature_for_binary_classification(count_vectorizer, classifier, n=20)

    # Predict on the unlabeled reviews of the frame that was passed in.  The
    # original read them from an undefined ``reviews_df`` and refitted a new
    # TfidfTransformer on them.
    reviews_nc = df[df['class'] == '']
    comments_nc = list(reviews_nc[column].values)
    counts_nc = count_vectorizer.transform(comments_nc)
    tfidf_comments_nc = tfidf_transformer.transform(counts_nc)
    new_predicted = classifier.predict(tfidf_comments_nc)
    print("negative = %s" % sum(new_predicted == 'negative'))
    print("positive = %s" % sum(new_predicted == 'positive'))
def main():
    """Train a Bernoulli Naive Bayes rating classifier and write a submission.

    Reads ``train1.csv`` (columns ``review`` and ``rating``), trains on the
    first two thirds of the rows, prints the accuracy and a classification
    report for the remaining third, then predicts a rating for every review in
    ``test2.csv`` and writes the result to ``submission1.csv``.
    """
    # Get the data and targets; rows whose rating is the literal text
    # 'rating' are dropped.
    df = pd.read_csv('train1.csv')
    df = df[df.rating != 'rating']
    corpus = list(df.review)
    target = list(df.rating)

    # Floor division keeps the split point an integer on Python 3 as well.
    splitPoint = len(corpus) * 2 // 3
    trainingCorpus = corpus[:splitPoint]
    testCorpus = corpus[splitPoint:]
    trainingTarget = np.array(target[:splitPoint])
    testTarget = np.array(target[splitPoint:])

    # Train the algorithm.  With 'None' as vocabulary the helper returns the
    # matrix together with the vocabulary it built.
    train_X, vocabList = createVectorizer(trainingCorpus, 'None', True)
    NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget)

    # Test the algorithm
    test_X = createVectorizer(testCorpus, vocabList, True)
    test_predict = NB_Bern_model.predict(test_X)
    print(np.mean(test_predict == testTarget))
    print(metrics.classification_report(testTarget, test_predict, target_names=['0', '1']))

    # Make predictions
    predict_df = pd.read_csv('test2.csv')
    predictCorpus = list(predict_df.review)
    predict_X = createVectorizer(predictCorpus, vocabList, True)
    predictions = NB_Bern_model.predict(predict_X)

    # The predictions are row-aligned with predict_df, so assign them in one
    # step.  The original looked every ID up again, which is quadratic and
    # writes the wrong value when an ID occurs more than once.
    predict_df.columns = ['ID', 'Predicted']
    predict_df['Predicted'] = predictions
    predict_df.to_csv('submission1.csv', sep=',', index=False)
def generatePredictingModel(data):
    """
    Build the prediction model (based on the data set we have) in order to be
    able to predict the category of a new video from the user input.

    Two Bernoulli Naive Bayes models are trained on bag-of-words counts, one
    on the titles and one on the descriptions.  A decision tree is then
    trained on their two predictions to produce the final category.

    Args:
        data: DataFrame with ``title``, ``description`` and
            ``video_category_id`` columns.

    Returns:
        Tuple ``(tree, model_count_title, model_count_description,
        count_vectorizer_title, count_vectorizer_description)``.

    Raises:
        VideoAnalysisException: if any step of the model creation fails.
    """
    try:
        # Initialize a timer to compute the time to build the model
        start = time.time()

        # Split into train-test data set.  The target column is selected by
        # name: the original ``x in ('video_category_id')`` was a substring
        # test against a plain string and could match unrelated columns.
        X = data[[x for x in data.columns if x in ('title', 'description')]]
        Y = data[['video_category_id']]
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, train_size=0.80, random_state=10)

        # Learn one vocabulary per text field
        count_vectorizer_title = CountVectorizer()
        count_vectorizer_description = CountVectorizer()
        count_vectorizer_title.fit(X_train['title'].values.tolist())
        count_vectorizer_description.fit(X_train['description'].values.tolist())

        # Build the sparse training matrices
        X_train_count_title = count_vectorizer_title.transform(X_train['title'])
        X_train_count_description = count_vectorizer_description.transform(
            X_train['description'])

        # Set and train the models (for title and description features)
        model_count_title = BernoulliNB()
        model_count_description = BernoulliNB()
        model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
        model_count_description.fit(X_train_count_description,
                                    Y_train['video_category_id'])

        # Merge the title and description predictions and train the final
        # classifier on these two predictions combined.
        new_df_train = pd.DataFrame()
        new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
        new_df_train['description_prediction'] = model_count_description.predict(
            X_train_count_description)

        tree = DecisionTreeClassifier()
        tree.fit(new_df_train, Y_train)

        end = time.time()
        execution_time = end - start
        print("Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time))
        time.sleep(3)
        return (tree, model_count_title, model_count_description,
                count_vectorizer_title, count_vectorizer_description)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer converted into a model error.
        raise VideoAnalysisException(" Error while creation of predictive model ")
def testBoGNB(self):
    '''
    Test on sentiment analysis task using Naive Bayes classifier with
    Bag-of-Word feature vectors.

    Reads ``self.senti_train_txt`` / ``self.senti_test_txt`` (sentences) and
    ``self.senti_train_label`` / ``self.senti_test_label``, builds binary
    bag-of-words features over the joint vocabulary, fits a BernoulliNB and
    prints accuracies and class balance.
    '''
    def tokenize(sent):
        # Whitespace tokens longer than two characters, lower-cased.
        return [word.lower() for word in sent.split() if len(word) > 2]

    def build_features(sentences, word2index):
        # One row per sentence; a column is 1.0 when the word occurs.
        feat = np.zeros((len(sentences), len(word2index)), dtype=float)
        for i, sent in enumerate(sentences):
            # A real list is needed for NumPy fancy indexing; on Python 3
            # map() would return an iterator.
            indices = [word2index[word] for word in tokenize(sent)]
            feat[i, indices] = 1.0
        return feat

    # Preprocessing of original txt data set: the vocabulary covers both
    # the training and the test sentences.
    wordlist = []
    for sent in self.senti_train_txt:
        wordlist.extend(tokenize(sent))
    for sent in self.senti_test_txt:
        wordlist.extend(tokenize(sent))
    word_dict = set(wordlist)
    word2index = dict(zip(word_dict, range(len(word_dict))))

    # Build BoG feature
    train_size = len(self.senti_train_txt)
    test_size = len(self.senti_test_txt)
    pprint('Training set size: %d' % train_size)
    pprint('Test set size: %d' % test_size)

    # Using binary feature
    start_time = time.time()
    train_feat = build_features(self.senti_train_txt, word2index)
    test_feat = build_features(self.senti_test_txt, word2index)
    end_time = time.time()
    pprint('Finished building training and test feature matrix, time used: %f seconds.' %
           (end_time - start_time))

    pprint('Classification using Bernoulli Naive Bayes classifier: ')
    clf = BernoulliNB()
    clf.fit(train_feat, self.senti_train_label)
    train_pred_label = clf.predict(train_feat)
    train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size)
    pprint('Training accuracy = %f' % train_acc)
    pred_label = clf.predict(test_feat)
    acc = np.sum(pred_label == self.senti_test_label) / float(test_size)
    pprint('Accuracy: %f' % acc)

    train_pos_count = np.sum(self.senti_train_label == 1)
    train_neg_count = np.sum(self.senti_train_label == 0)
    test_pos_count = np.sum(self.senti_test_label == 1)
    test_neg_count = np.sum(self.senti_test_label == 0)
    pprint('Positive count in training set: %d' % train_pos_count)
    pprint('Negative count in training set: %d' % train_neg_count)
    pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count))
    pprint('Positive count in test set: %d' % test_pos_count)
    pprint('Negative count in test set: %d' % test_neg_count)
    pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
def train(neg=None, pos=None):
    """Train a Bernoulli Naive Bayes sentiment classifier and save it.

    Every line of the negative file is a sample labeled 0 and every line of
    the positive file a sample labeled 1.  The text is segmented with
    ``crfseg``, TF-IDF vectorized and used to fit a ``BernoulliNB``, which is
    dumped to ``data/sgdc_clf.pkl`` next to this module.  Finally the
    prediction for one sample sentence is printed.

    Args:
        neg: Path of the UTF-8 file with negative samples; defaults to
            ``../origin/neg.txt`` relative to this module.
        pos: Path of the UTF-8 file with positive samples; defaults to
            ``../origin/pos.txt`` relative to this module.
    """
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')

    tagger = crfseg.create_tagger()

    def tok_cn(x):
        # Tokenizer handed to the vectorizer; replaces the Python-2-only
        # ``lambda (x): ...`` spelling.
        return crfseg.cut_zh(x, tagger)

    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
    ])
    clf = BernoulliNB()

    x_train = []
    y_train = []
    # Context managers close the files even when reading fails.
    with codecs.open(neg, 'r', 'utf-8') as neg_file:
        for line in neg_file:
            x_train.append(line)
            y_train.append(0)
    with codecs.open(pos, 'r', 'utf-8') as pos_file:
        for line in pos_file:
            x_train.append(line)
            y_train.append(1)

    print('begin transform')
    x_train = pipe.fit_transform(x_train)
    print('begin fit')
    clf.fit(x_train, y_train)

    print('begin save')
    # NOTE(review): only the classifier is persisted; the fitted vectorizer
    # is not saved, so the pickle cannot be used on raw text by itself.
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    joblib.dump(clf, clf_file, compress=9)

    print('begin test')
    x_test = [u'这个东西真心很赞']
    x_test = pipe.transform(x_test)
    print(clf.predict(x_test))
def main():
    """Compare Bernoulli and multinomial NB on one player's fantasy scores.

    Builds the feature matrix for player 8439 from the CSV files in the
    working directory, holds out the rows after the index returned by
    ``get_ninety_percent``, fits both classifiers and prints their
    predictions, accuracies, class probabilities and the expected value
    derived from the first test row.
    """
    start_time = time.time()

    # read in game and player data
    games_data = pd.read_csv('games-data.csv')
    player_data = pd.read_csv('players.csv')
    plyr_ids = np.unique(np.array(player_data['ID']))
    # read in fantasy scores
    fantasy_scores = pd.read_csv('fantasy_scores.csv')

    # gets player training matrix
    plyr_id = 8439
    X = create_training_set(plyr_id, games_data, plyr_ids)
    index = get_ninety_percent(len(np.array(X.index)))  # for cross-validation
    train_X = X[:index]
    test_X = X[index:]

    # gets output vector for ALL rows of X.  The original requested scores
    # for the training games only, so ``Y[index:]`` (the test labels) was
    # empty and the printed rows were misaligned.
    plyr_game_ids = np.array(X.index)
    scores = plyr_fantasy_pts(plyr_id, plyr_game_ids, fantasy_scores)
    Y = discretize(scores.values)
    train_Y = Y[:index]
    test_Y = Y[index:]

    # run Bernoulli NB Classifier
    nb_clf = BernoulliNB()
    nb_clf.fit(train_X, train_Y)
    nb_predictions = nb_clf.predict(test_X)

    # run Multinomial NB Classifier (the original predicted with nb_clf here)
    mn_clf = MultinomialNB()
    mn_clf.fit(train_X, train_Y)
    mn_predictions = mn_clf.predict(test_X)

    # test for game, fantasy score alignment
    for offset in range(len(test_Y)):
        row = index + offset  # position of this test row within X
        print("%s %s %s %s %s" % (plyr_game_ids[row], scores.values[row],
                                  test_Y[offset], nb_predictions[offset],
                                  mn_predictions[offset]))

    nb_probs = nb_clf.predict_proba(test_X)
    print("Bernoulli NB accuracy:  %s" % (nb_clf.score(test_X, test_Y),))
    print("Bernoulli NB prob estimates:  %s" % (nb_probs,))
    print("Multinomial NB accuracy:  %s" % (mn_clf.score(test_X, test_Y),))
    print("Multinomial NB prob estimates:  %s" % (mn_clf.predict_proba(test_X),))

    print(len(nb_probs[0]))
    # A copy is passed in case the helper normalizes in place.
    nb_norm_prob = normalize_probs(nb_probs[0].copy())
    vals = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    ev = expected_val(nb_norm_prob, vals)
    print("EV:  %s" % (ev,))

    end_time = time.time()
    print("Elapsed time was %g seconds" % (end_time - start_time))
class MuscleClassifier():
    """Bernoulli Naive Bayes classifier mapping exercise text to muscle groups."""

    def __init__(self, auto_load=True):
        """
        Initializes our MuscleClassifier

        Option to preload it or start from fresh model.  With
        ``auto_load=False`` only ``self.model`` exists until ``train`` is
        called.
        """
        #=====[ If auto_load, then we rehydrate our existing models ]=====
        if auto_load:
            # Pickles are binary data: open in 'rb' and close the handles.
            # NOTE(review): pickle.load executes code from the file; these
            # paths must only ever point at trusted, project-owned files.
            with open('modules/pickled/muscle_classifier.p', 'rb') as handle:
                self.model = pickle.load(handle)
            with open('modules/pickled/muscle_classifier_le.p', 'rb') as handle:
                self.le = pickle.load(handle)
            with open('modules/pickled/muscle_classifier_vectorizer.p', 'rb') as handle:
                self.vectorizer = pickle.load(handle)
        else:
            self.model = BernoulliNB()

    def train(self, muscle_groups, labels):
        """
        Vectorizes raw input and trains our classifier

        Args:
            muscle_groups: Iterable of raw text samples.
            labels: Text label for each sample.
        """
        #=====[ Instantiate label encoder to turn text labels into ints ]=====
        self.le = preprocessing.LabelEncoder()

        #=====[ Declare vectorizers and merge them via a FeatureUnion ]=====
        char_vzr = feature_extraction.text.CountVectorizer(
            lowercase=True, ngram_range=(3, 8), analyzer='char', encoding='utf-8')
        word_vzr = feature_extraction.text.CountVectorizer(
            lowercase=True, ngram_range=(1, 5), analyzer='word', encoding='utf-8')
        self.vectorizer = FeatureUnion([('char', char_vzr), ('word', word_vzr)])

        #=====[ Transform our input and labels ]=====
        X = self.vectorizer.fit_transform(muscle_groups).toarray()
        Y = self.le.fit_transform(labels)

        #=====[ Fit our model and then run inference on training data ]=====
        self.model.fit(X, Y)
        y = self.model.predict(X)

        #=====[ Report Training Accuracy ]=====
        # Fraction of matching predictions; the original counted mismatches
        # and therefore printed the error rate.
        print("Training Accuracy: %f " % (sum(y == Y) / float(len(Y))))

    def predict(self, exercises):
        """
        Takes in raw input, vectorizes it, and reports back predicted muscle group

        Returns:
            The decoded class labels, one per input sample.
        """
        X = self.vectorizer.transform(exercises).toarray()
        y = self.model.predict(X)
        return self.le.classes_[y]
def bnb_baseline(bow_train, train_labels, bow_test, test_labels):
    """Fit a BernoulliNB baseline on binarized bag-of-words counts.

    Both matrices are reduced to 0/1 presence indicators, the model is
    fitted on the training part, and the train and test accuracies are
    printed.

    Returns:
        The fitted ``BernoulliNB`` model.
    """
    def to_presence(bow):
        # 1 where the term occurs at least once, 0 otherwise.
        return (bow > 0).astype(int)

    train_features = to_presence(bow_train)
    test_features = to_presence(bow_test)

    # training the baseline model
    model = BernoulliNB()
    model.fit(train_features, train_labels)

    # evaluate the baseline model
    train_accuracy = (model.predict(train_features) == train_labels).mean()
    print('BernoulliNB baseline train accuracy = {}'.format(train_accuracy))
    test_accuracy = (model.predict(test_features) == test_labels).mean()
    print('BernoulliNB baseline test accuracy = {}'.format(test_accuracy))
    return model
def BernoulliNB_1(train_predictors,test_predictors,train_target,test_target):
    """Fit a BernoulliNB and score it on the test data.

    Prints the accuracy and returns ``(accuracy, predicted)`` where
    ``predicted`` holds the labels predicted for ``test_predictors``.
    """
    model = BernoulliNB()
    model.fit(train_predictors, train_target)
    predicted = model.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    message = "Accuracy for Bernoulli Naive Bayes: " + str(accuracy)
    print(message)
    return accuracy, predicted
def test_discretenb_predict_proba():
    """Test discrete NB classes' probability scores"""
    # The 100s below distinguish Bernoulli from multinomial.
    X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
    X_multinomial = [[0, 1], [1, 3], [4, 0]]
    # Each estimator class is paired with the data set it is checked on.
    cases = [(BernoulliNB, X_bernoulli), (MultinomialNB, X_multinomial)]

    # Confirm that the 100s above distinguish Bernoulli from multinomial
    y = [0, 0, 1]
    bernoulli_fit = BernoulliNB().fit(X_bernoulli, y)
    multinomial_fit = MultinomialNB().fit(X_bernoulli, y)
    last_bernoulli = bernoulli_fit.predict(X_bernoulli)[-1]
    last_multinomial = multinomial_fit.predict(X_bernoulli)[-1]
    assert_not_equal(last_bernoulli, last_multinomial)

    # test binary case (1-d output)
    y = [0, 0, 2]  # 2 is regression test for binary case, 02e673
    for estimator, samples in cases:
        clf = estimator().fit(samples, y)
        assert_equal(clf.predict(samples[-1]), 2)
        assert_equal(clf.predict_proba(samples[0]).shape, (1, 2))
        row_sums = clf.predict_proba(samples[:2]).sum(axis=1)
        assert_array_almost_equal(row_sums, np.array([1., 1.]), 6)

    # test multiclass case (2-d output, must sum to one)
    y = [0, 1, 2]
    for estimator, samples in cases:
        clf = estimator().fit(samples, y)
        assert_equal(clf.predict_proba(samples[0]).shape, (1, 3))
        assert_equal(clf.predict_proba(samples[:2]).shape, (2, 3))
        assert_almost_equal(np.sum(clf.predict_proba(samples[1])), 1)
        assert_almost_equal(np.sum(clf.predict_proba(samples[-1])), 1)
        assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1)
        assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
def train_model(data, target):
    """
    Splits the data into a training set and test set.

    Instantiates a Bernoulli Naive Bayes classifier, trains it on the
    training set, evaluates it on the held-out 40% via ``evaluate_model`` and
    pickles the fitted classifier to ``nb_classifier.pickle``.

    Returns:
        The fitted classifier.
    """
    # Using cross-validation
    # TO TRY: stratification for dividing preclassified tweets into
    # homogeneous subgroups before sampling in order to improve the
    # representativeness of the sampling
    train_tweets, validation_tweets, train_sentiment, validation_sentiment = \
        cross_validation.train_test_split(data, target, test_size=0.4)

    # Fitting the Naive Bayes classifier with the training tweets and
    # corresponding sentiment
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)
    predicted = classifier.predict(validation_tweets)

    # Using the cross-validation split, evaluate the accuracy of the
    # predicted tweets
    evaluate_model(validation_sentiment, predicted)

    # Pickling the classifier; the context manager closes the file even if
    # the dump fails.
    with open('nb_classifier.pickle', 'wb') as pickle_file:
        pickle.dump(classifier, pickle_file)
    return classifier
def predict(cur, plyr_id, game_plyrs):
    """Predict a player's expected fantasy score for one upcoming game.

    A training matrix with one row per game the player appeared in and one
    column per known player is filled by ``populate_training_set``; a
    BernoulliNB is fitted against ``training_output_vector`` and applied to a
    single row describing ``game_plyrs``.

    Args:
        cur: Database cursor passed through to the data helpers.
        plyr_id: ID of the player to predict for.
        game_plyrs: Players taking part in the game to predict.

    Returns:
        The expected value rounded to one decimal, or 0 when the player has
        no games to train on.
    """
    # creates training set (called 'X') for plyr
    all_plyrs = all_player_ids(cur)        # np.array - all NFL players (and coaches)
    games = games_played_in(cur, plyr_id)  # np.array - the game ids the player played in
    n_cols = all_plyrs.shape[0]
    m_rows = games.shape[0]
    X = pd.DataFrame(np.zeros((m_rows, n_cols)), index=games, columns=all_plyrs)
    populate_training_set(cur, X, games, plyr_id)
    print("X:  %s" % (X.values,))

    # creates vector of known output values
    Y = training_output_vector(cur, games, plyr_id)
    print("(len) Y:  %s %s" % (len(Y), Y))

    # The query is a single fresh row.  The original built it from the
    # (m_rows x n_cols) training buffer, leaving ``test_zeros`` unused.
    test_X = pd.DataFrame(np.zeros((1, n_cols)), columns=all_plyrs)
    update_training_matrix(game_plyrs, 0, test_X)

    # run Bernoulli NB Classifier
    nb_clf = BernoulliNB()
    if len(X.values) == 0:
        return 0
    nb_clf.fit(X, Y)
    print("test_X:  %s" % (test_X.values,))

    raw_probs = nb_clf.predict_proba(test_X)[0]
    # A copy is passed in case the helper normalizes in place.
    nb_norm_prob = normalize_probs(raw_probs.copy())
    avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    print("param vector:  %s" % (raw_probs,))
    print("probs:  %s" % (nb_norm_prob,))
    print(avgs)
    ev = expected_val(nb_norm_prob, avgs)  # can also calc dot product
    return round(ev, 1)
def test_BernouliNB2():
    """Fit BernoulliNB on eight single-sample classes and print predictions.

    The training rows mix small and very large values; the default
    binarization at 0 reduces each feature to "positive or not".  Each row
    of a second matrix is then classified and printed with its prediction.
    """
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
    ])
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print('Y ' + str(Y))
    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1002, 1010],
        [1010, 910],
        [1003, 980],
        [1008, 1030],
        [-1, -1],
        [-3, -10],
        [40, 1],
        [1, -100],
    ])
    for i in range(len(X2)):
        pred_ret = clf.predict(X2[i])
        # Print the row that was classified.  The original printed the
        # training row X[i] next to the prediction for X2[i].
        print('X[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret))
def test_BernouliNB4():
    """Fit BernoulliNB on ten binary samples and print the prediction for [1, 1]."""
    train_rows = [
        [1, 1], [1, 1], [1, 1],
        [1, 0], [1, 0], [1, 0], [1, 0],
        [0, 0], [0, 0],
        [1, 0],
    ]
    X = np.array(train_rows)
    print('X ' + str(X))
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print('Y ' + str(Y))

    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)

    X2 = np.array([[1, 1]])
    for i, query in enumerate(X2):
        pred_ret = clf.predict(query)
        print('X[' + str(i) + '] = ' + str(query) + ' pred_ret ' + str(pred_ret))
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    """Classify the test vectors with a BernoulliNB (alpha=0.01).

    The model is fitted on ``data_train_vectors`` against
    ``data_train.target``; extra keyword arguments are accepted and ignored.

    Returns:
        The labels predicted for ``data_test_vectors``.
    """
    model = BernoulliNB(alpha=.01)
    model.fit(data_train_vectors, data_train.target)
    return model.predict(data_test_vectors)
class NaiveBayesClassifierBernoulli:
    """
    this class capsules the Bernoulli NaiveBayes functions of scikit-learn
    in BernoulliNB class
    """

    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        """Load the training matrix and dictionary, then fit the classifier.

        Args:
            matrixFileName: svmlight-format file with features and labels.
            dicFileName: Pickled dictionary used to turn sentences into rows.
        """
        self.X, self.Y = load_svmlight_file(matrixFileName)
        # NOTE(review): pickle.load executes code from the file; only use
        # with trusted dictionary files.
        with open(dicFileName, "rb") as dic_file:
            self.dictionary = pickle.load(dic_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        """Return the predicted class for ``string``, or None if it has no row."""
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        # Identity test: ``row != None`` on an array or sparse row is an
        # element-wise comparison, not a None check.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self, string):
        """Return P(class 1) - P(class 0) for ``string``, or None if it has no row."""
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is None:
            return None
        a = self.bernoulliNB.predict_proba(row)
        return a[0][1] - a[0][0]
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias):
    """Combine two content and two syntax classifiers by biased voting.

    MultinomialNB and BernoulliNB are trained on the content features
    (``train_x``), a linear and an RBF SVM on the syntax features
    (``train_f_x``).  For each test sample the first classifier of the
    biased group wins whenever one of the two classifiers it is compared
    with agrees; otherwise the fallback classifier decides:

    * ``bias == 'content'``: MultinomialNB if it agrees with BernoulliNB or
      the linear SVM, else BernoulliNB.
    * ``bias == 'syntax'``: MultinomialNB if it agrees with the linear or
      the RBF SVM, else the linear SVM.

    Returns:
        Accuracy of the combined labels against ``test_y``.

    Raises:
        ValueError: if ``bias`` is neither ``'content'`` nor ``'syntax'``.
    """
    # Validate before the expensive training.  The original trained all four
    # models, printed a message and then scored an empty label list.
    if bias not in ('content', 'syntax'):
        raise ValueError('Please enter a valid bias ("syntax" or "content")!')

    clf_c1 = MultinomialNB()
    clf_c1.fit(train_x, train_y)
    clf_c2 = BernoulliNB()
    clf_c2.fit(train_x, train_y)
    clf_f1 = svm.SVC(kernel='linear', cache_size=512)
    clf_f1.fit(train_f_x, train_f_y)
    clf_f2 = svm.SVC(kernel='rbf', cache_size=512)
    clf_f2.fit(train_f_x, train_f_y)

    p1 = clf_c1.predict(test_x)
    p2 = clf_c2.predict(test_x)
    p3 = clf_f1.predict(test_f_x)
    p4 = clf_f2.predict(test_f_x)

    # Final labels that result from the voting
    if bias == 'content':
        labels = [a if (a == b or a == c) else b
                  for a, b, c in zip(p1, p2, p3)]
    else:
        labels = [a if (a == c or a == d) else c
                  for a, c, d in zip(p1, p3, p4)]

    p_combined = np.array(labels)
    accuracy = np.sum(p_combined == test_y) / np.float_(len(test_y))
    return accuracy
def render_content(self):
    """Fit a BernoulliNB on the selected text column and render the result.

    Returns a fixed message when no text source is selected.  Otherwise the
    text is count-vectorized (user stop list, ``self.vocab_size`` features),
    a classifier is fitted against the code column, and the training
    accuracy plus the confusion matrix are returned as an HTML fragment.
    """
    if self.text_source is None:
        return "No text source selected."

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import metrics

    self.dm("creating vectorizer")
    stop_words = self.get_user_list(self.stop_list)
    vectorizer = CountVectorizer(stop_words=stop_words,
                                 max_features=self.vocab_size)
    documents = self.get_column_data(self.text_source)

    self.dm("using vectorizer")
    features = vectorizer.fit_transform(documents)
    codes = self.get_column_data(self.code_source)

    self.dm("creating classifier")
    classifier = BernoulliNB()
    classifier.fit(features, codes)
    accuracy = classifier.score(features, codes)

    self.dm("predicting")
    predicted = classifier.predict(features)
    confusion = metrics.confusion_matrix(codes, predicted)

    self.dm("displaying result")
    parts = ["accuracy is " + str(round(accuracy, 2)),
             '<pre>' + str(confusion) + '</pre>']
    return "".join(parts)
def bernoulli_classify():
    """Train a BernoulliNB on the article training set and report test accuracy.

    Every ``*-articles.json`` file in the training and test directories is a
    class; the class name is the file name without directory and suffix.
    Each test sample's true and predicted class is printed, followed by the
    totals.
    """
    clf = BernoulliNB()
    traindata, traintarget = _load_bernoulli_articles(
        "../../../res/articles/training_data/*-articles.json")
    testdata, testtarget = _load_bernoulli_articles(
        "../../../res/articles/test_data/*-articles.json")
    clf.fit(traindata, traintarget)

    total = len(testdata)
    # One batched call instead of one predict() per sample.
    predictions = clf.predict(testdata) if total else []
    ncorrect = 0
    for correct, predicted in zip(testtarget, predictions):
        if correct == predicted:
            ncorrect += 1
        print("Correct: {0} - Predicted: {1}".format(correct, predicted))

    # Guard against an empty test directory (previously ZeroDivisionError).
    correctness = ncorrect * 1.0 / total if total else 0.0
    print("Correct  %s  Total  %s  Correctness  %s" % (ncorrect, total, correctness))


def _load_bernoulli_articles(pattern):
    """Read every file matching ``pattern``; return ``(samples, targets)``."""
    samples = []
    targets = []
    for path in glob.glob(pattern):
        # Class name = file name without the suffix and leading directories.
        target = path.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(path, target)
        samples.extend(output[0])
        targets.extend(output[1])
    return samples, targets
def learn_model(data, target):
    """Fit a BernoulliNB on 80% of the data and evaluate it on the other 20%.

    The split is reproducible (``random_state=43``); the test labels and the
    predictions are handed to ``evaluate_model``.
    """
    # split validation: 80% training, 20% test
    split = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=43
    )
    data_train, data_test, target_train, target_test = split

    model = BernoulliNB()
    model.fit(data_train, target_train)
    evaluate_model(target_test, model.predict(data_test))
def naive_bayesB_classifier(X_train, categories, X_test, test_categories):
    """Fit a BernoulliNB (alpha=0.1) and report its performance on the test set.

    Prints the classification report and the accuracy, then passes the true
    and predicted categories to ``to_latex``.
    """
    from sklearn.naive_bayes import BernoulliNB

    model = BernoulliNB(alpha = 0.10000000000000001)
    model.fit(X_train, categories)
    y_nb_predicted = model.predict(X_test)

    report = metrics.classification_report(test_categories, y_nb_predicted)
    print("\n Here is the classification report for Naive Bayes classifier:")
    print(report)

    score = metrics.accuracy_score(test_categories, y_nb_predicted)
    print("Accuracy score:")
    print(score)

    to_latex(test_categories, y_nb_predicted)
def learnBModel(ip,label,tst,tst_label):
    """Train a BernoulliNB on TF-IDF features and evaluate it on the test set.

    ``ip.data`` and ``tst.data`` are vectorized with one TfidfVectorizer
    fitted on the training text, each matrix is passed through a
    ``TfidfTransformer(use_idf=False)``, and the predictions for the test
    matrix are handed to ``evaluate_model`` together with ``tst_label``.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    train_matrix = vectorizer.fit_transform(ip.data)
    test_matrix = vectorizer.transform(tst.data)

    # Each matrix gets its own transformer, as before.
    train_features = TfidfTransformer(use_idf=False).fit_transform(train_matrix)
    test_features = TfidfTransformer(use_idf=False).fit_transform(test_matrix)

    model = BernoulliNB()
    model.fit(train_features, label)
    evaluate_model(tst_label, model.predict(test_features))
def nb_classifier(self, secret):
    """Fit a BernoulliNB on the selected features for ``secret``.

    Returns:
        Tuple ``(classifier, feature_selector, evaluation)`` where the
        evaluation is ``self.evaluate`` applied to the predictions on the
        training data and the true labels.
    """
    model = BernoulliNB()
    raw_features = self.raw_attr_vector(secret)
    labels = self.get_labels(secret)
    selector = self.feature_sel(secret)
    selected = selector.transform(raw_features)
    model.fit(selected, labels)
    predicted = model.predict(selected)
    return model, selector, self.evaluate(predicted, labels)
def classify(self):
    '''
    Train and test with a default classifier chosen by ``self.typeStr``.

    'NB' selects BernoulliNB and 'Tree' a decision tree; the model is
    fitted on ``self.X`` / ``self.Y`` and applied to ``self.x``.  For any
    other type nothing is trained and the existing ``self.y_hat`` is
    returned.

    Returns
    -------
    label: the array of predicted result
    '''
    kind = self.typeStr
    if kind == 'NB':
        model = BernoulliNB()
    elif kind == 'Tree':
        model = tree.DecisionTreeClassifier()
    else:
        model = None

    if model is not None:
        model.fit(self.X, self.Y)
        self.y_hat = model.predict(self.x)
    return self.y_hat
class NaiveBayes:
    """Single-sample wrapper around a BernoulliNB classifier."""

    def train(self, x, y, weight = None):
        """Create a fresh BernoulliNB and fit it, optionally with sample weights."""
        self.classifier = BernoulliNB()
        self.classifier.fit(x, y, sample_weight=weight)

    def predict(self, x):
        """Return the prediction for the first sample in ``x``."""
        predictions = self.classifier.predict(x)
        return predictions[0]

    def newInstance(self):
        """Return a new, untrained wrapper."""
        return NaiveBayes()

    def name(self):
        """Return the display name of this classifier."""
        return "NaiveBayes"
def evaluate_baseline():
    """Log the cross-validated and held-out accuracy of a BernoulliNB baseline.

    The aclImdb train and test matrices are built by the ``preprocessing``
    helpers; the 10-fold cross-validation score on the training data and the
    accuracy on the test data are written to the log.
    """
    inputs, outputs, words = preprocessing.build_data_target_matrices(
        "aclImdb/train/pos", "aclImdb/train/neg", binary_output=True)
    tst_inputs, tst_outputs, _ = preprocessing.build_test_data_target_matrices(
        "aclImdb/test/pos", "aclImdb/test/neg", words, binary_output=True)

    train_labels = outputs.ravel()
    model = BernoulliNB()
    scores = cross_val_score(model, inputs, train_labels, cv=10)
    summary = ("Baseline BernoulliNB", scores.mean(), scores.std())
    logging.info("Accuracy for %s: %.02f, std: %.02f" % summary)

    model.fit(inputs, train_labels)
    predicted = model.predict(tst_inputs)
    logging.info(accuracy_score(tst_outputs.ravel(), predicted))
class NaiveBayes(StatModel):
    """StatModel implementation backed by a BernoulliNB classifier."""

    def __init__(self):
        """Set the model name to "nb" and create an unfitted classifier."""
        self.name = "nb"
        self.model = BernoulliNB()

    def train(self, samples, labels):
        """Fit the classifier on ``samples`` against ``labels``."""
        classifier = self.model
        classifier.fit(samples, labels)

    def predict(self, samples):
        """Return the predicted labels for ``samples``."""
        predictions = self.model.predict(samples)
        return predictions
def test_BernouliNB():
    """Fit BernoulliNB on six random binary rows and print each row's prediction."""
    n_samples = 6
    X = np.random.randint(2, size=(n_samples, 100))
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 4, 5])
    print('Y ' + str(Y))

    clf = BernoulliNB()
    clf.fit(X, Y)

    for i in range(n_samples):
        sample = X[i]
        pred_ret = clf.predict(sample)
        print('X[' + str(i) + '] = ' + str(sample) + ' pred_ret ' + str(pred_ret))
def learn_model(data,target):
    """Fit a BernoulliNB on a random 80/20 split and evaluate it.

    The split seed is drawn at random and printed so a run can be
    reproduced; the test labels and the predictions are handed to
    ``evaluate_model``.
    """
    # split validation: 80% training, 20% test (test_size=0.2)
    state = 123 + randrange(1, 23432)
    print("statue 6857")
    print(state)

    split = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=state)
    data_train, data_test, target_train, target_test = split

    model = BernoulliNB()
    model.fit(data_train, target_train)
    evaluate_model(target_test, model.predict(data_test))
def naive_bayes(train,validation):
    """Fit a BernoulliNB crime-category model and report validation metrics.

    ``train`` is re-split 90/10 inside the function and the ``validation``
    argument is replaced by the 10% part.  Precision, recall, accuracy and
    log loss are printed and the class probabilities plus the predicted
    category names are written to ``naiveBayes_test.csv``.  Relies on the
    module-level ``start`` timestamp and ``le_crime`` label encoder.
    """
    # feature columns
    quarters = ['Fall','Spring','Summer','Winter']
    districts = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
    shifts = ['first','second','third']
    hours = list(range(0, 24))
    minutes = list(range(100, 160))
    features = districts + shifts + minutes + quarters + hours

    # split set into train, validation
    train, validation = train_test_split(train, train_size=0.9)
    model = BernoulliNB()
    model.fit(train[features], train['Category'])

    # time calculation
    stop = timeit.default_timer()
    print(" ".join(("Runnin time naive bayes is ", str(stop - start))))

    predicted = np.array(model.predict_proba(validation[features]))
    validation_pred = model.predict(validation[features])
    train_pred = model.predict(train[features])

    print("-----------------------------Naive Bayes----------------------------------------------------------------------------")
    print(" ".join(("Precision is ", str(precision_score(validation['Category'].values.tolist(), validation_pred, average='macro')))))
    print(" ".join(("Recall is ", str(recall_score(validation['Category'].values.tolist(), validation_pred, average='macro')))))
    print(" ".join(("Accuracy is ", str(accuracy_score(validation['Category'].values.tolist(), validation_pred)))))
    print(" ".join(("Training Accuracy is ", str(accuracy_score(train['Category'].values.tolist(), train_pred)))))

    # map the predicted class indices back to category names
    Category_new = [le_crime.classes_[label] for label in validation_pred]

    # store result into file
    result = pd.DataFrame(predicted, columns=le_crime.classes_)
    result['Predicted'] = Category_new
    result.to_csv('naiveBayes_test.csv', index = True, index_label = 'Id' )

    # log loss function
    loss = log_loss(validation['Category'], predicted, eps=1e-15, normalize=True, sample_weight=None)
    print(" ".join(("Log loss is", str(loss))))
class MyApp(QtGui.QMainWindow, Ui_MainWindow):
    """Main window that trains a Naive Bayes classifier and shows its metrics.

    ``x_data`` / ``y_data`` hold the loaded samples and labels.  Pressing the
    run button normalizes, splits, trains the classifier selected by the
    radio buttons and fills the result tables.
    """

    def __init__(self):
        self.x_data = list()
        self.y_data = list()
        QtGui.QMainWindow.__init__(self)
        Ui_MainWindow.__init__(self)
        self.setupUi(self)
        self.dt_action.clicked.connect(self.ss)
        self.mm_Button01.clicked.connect(self.ptu01)
        self.mm_Button02.clicked.connect(self.ptu02)
        self.roc_Button.clicked.connect(self.pro)
        self.save_button.clicked.connect(self.out_model)
        # Group the normalization radio buttons; the first is the default.
        self.bg01 = QtGui.QButtonGroup()
        self.bg01.addButton(self.s_radio_1, 1)
        self.bg01.addButton(self.s_radio_2, 2)
        self.s_radio_1.setChecked(True)
        # Group the data-split radio buttons; the first is the default.
        self.bg02 = QtGui.QButtonGroup()
        self.bg02.addButton(self.d_radio_1, 1)
        self.bg02.addButton(self.d_radio_2, 2)
        self.d_radio_1.setChecked(True)
        # Group the three Bayes classifier radio buttons; first is default.
        self.bg03 = QtGui.QButtonGroup()
        self.bg03.addButton(self.radioButton, 1)
        self.bg03.addButton(self.radioButton_2, 2)
        self.bg03.addButton(self.radioButton_3, 3)
        self.radioButton.setChecked(True)

    def ss(self):
        """Run the whole pipeline: normalize, split, train and display."""
        self.bz()   # normalization
        self.stt()  # train/test split
        self.dtc()  # train the classifier and fill the tables

    def bz(self):
        """Normalize ``x_data`` into ``self.x`` (standard scale or min-max)."""
        # NOTE(review): ``self.x`` is never read elsewhere in this class;
        # ``stt`` splits the raw ``x_data``.  Confirm whether that is intended.
        if self.bg01.checkedId() == 1:
            self.x = preprocessing.scale(self.x_data)
        else:
            min_max_scaler = preprocessing.MinMaxScaler()
            self.x = min_max_scaler.fit_transform(self.x_data)

    def stt(self):
        """Split the samples and labels into train and test lists.

        Produces ``x_train``, ``x_test``, ``y_train`` and ``y_test``, either
        from a "train:test" ratio out of ten or from explicit row counts.
        """
        self.x_train = list()
        self.x_test = list()
        self.y_train = list()
        self.y_test = list()
        if self.bg02.checkedId() == 1:
            strte = self.tt_box.itemText(self.tt_box.currentIndex())
            s01 = str(strte).split(':')
            if len(s01) == 2:
                xnum = math.ceil((int(s01[0]) * 1.0 / 10) * len(self.x_data))
                # NOTE(review): ``<=`` puts xnum + 1 rows into the training
                # set, while the explicit-count branch below uses ``<``.
                for i in range(len(self.x_data)):
                    if i <= xnum:
                        self.x_train.append(self.x_data[i])
                        self.y_train.append(self.y_data[i])
                    else:
                        self.x_test.append(self.x_data[i])
                        self.y_test.append(self.y_data[i])
        else:
            ts01 = int(self.train.text())
            ts02 = int(self.test.text())
            for i in range(ts01 + ts02):
                if i < ts01:
                    self.x_train.append(self.x_data[i])
                    self.y_train.append(self.y_data[i])
                else:
                    self.x_test.append(self.x_data[i])
                    self.y_test.append(self.y_data[i])

    def cmm(self, cm):
        """Compute per-class evaluation metrics from a confusion matrix.

        Args:
            cm: Square confusion matrix (rows = true, columns = predicted).

        Returns:
            Dict mapping ``self.labels[i]`` to
            ``[TPR, FNR, FPR, TNR, precision, recall, F-score]`` for class i
            treated as the positive class.  A rate is 0 when its numerator
            is 0.
        """
        ls = dict()
        for i in range(len(cm)):
            tp = cm[i][i]
            fp = sum(cm.T[i]) - tp
            fn = sum(cm[i]) - tp
            tn = sum(sum(cm)) - tp - fp - fn
            # True positive rate
            TPR = float(tp) / (tp + fn) if tp != 0 else 0
            # False negative rate = FN / (FN + TP).  The original divided by
            # (FP + TN), which is the false-positive-rate denominator.
            FNR = float(fn) / (fn + tp) if fn != 0 else 0
            # False positive rate
            FPR = float(fp) / (fp + tn) if fp != 0 else 0
            # True negative rate
            TNR = float(tn) / (tn + fp) if tn != 0 else 0
            if tp != 0:
                P = float(tp) / (tp + fp)        # precision
                R = float(tp) / (tp + fn)        # recall
                F_score = 2 * P * R / (P + R)    # harmonic mean of P and R
            else:
                P = 0
                R = 0
                F_score = 0
            ls[self.labels[i]] = [TPR, FNR, FPR, TNR, P, R, F_score]
        return ls

    def para(self):
        """Read the classifier parameters from the form; empty fields use defaults."""
        # MultinomialNB alpha
        if not self.mal_edit.text().strip():
            self.m_alpha = 1.0
        else:
            self.m_alpha = float(self.mal_edit.text())
        # MultinomialNB fit_prior
        if self.mfp_box.itemText(self.mfp_box.currentIndex()) == 'False':
            self.m_fit_prior = False
        else:
            self.m_fit_prior = True
        # BernoulliNB alpha
        if not self.bal_edit.text().strip():
            self.b_alpha = 1.0
        else:
            self.b_alpha = float(self.bal_edit.text())
        # BernoulliNB binarize threshold
        if not self.bi_edit.text().strip():
            self.binarize = None
        else:
            self.binarize = float(self.bi_edit.text())
        # BernoulliNB fit_prior
        if self.bfp_box.itemText(self.bfp_box.currentIndex()) == 'False':
            self.b_fit_prior = False
        else:
            self.b_fit_prior = True

    def _fill_result_table(self, table, real, pred):
        """Fill a real/pred table, coloring the rows where both values agree."""
        table.setRowCount(len(pred))
        table.setColumnCount(2)
        table.setHorizontalHeaderLabels(['real', 'pred'])
        for s in range(len(pred)):
            table.setItem(s, 0, QtGui.QTableWidgetItem(str(real[s])))
            table.setItem(s, 1, QtGui.QTableWidgetItem(str(pred[s])))
            if real[s] == pred[s]:
                table.item(s, 0).setBackgroundColor(QtGui.QColor(214, 71, 0))
                table.item(s, 1).setBackgroundColor(QtGui.QColor(214, 71, 0))

    def dtc(self):
        """Train the selected Naive Bayes classifier and display the results."""
        self.para()  # fetch all parameters
        # Flatten y to one dimension: each entry's first element is the label.
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
        # Collect the distinct labels in order of first appearance.
        # NOTE(review): labels come from the test split only; a class present
        # only in the training split would break the lookups below.
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print(self.labels)

        # Bayes classifier selection
        if self.bg03.checkedId() == 1:
            self.clf = GaussianNB()
        elif self.bg03.checkedId() == 2:
            self.clf = MultinomialNB(alpha=self.m_alpha,
                                     fit_prior=self.m_fit_prior)
        else:
            self.clf = BernoulliNB(alpha=self.b_alpha,
                                   binarize=self.binarize,
                                   fit_prior=self.b_fit_prior)
        self.clf.fit(self.x_train, self.y01_train)
        self.y_pred = self.clf.predict(self.x_test)
        self.x_pred = self.clf.predict(self.x_train)

        # dtable01: per-class metrics for the training and test sets
        self.dtable01.setRowCount(3 * len(self.labels))
        self.dtable01.setColumnCount(8)
        lab = [
            '真正率(TPR)', '假负率(FNR)', '假正率(FPR)', '真负率(TNR)', '精确度(PRE)',
            '召回率(REC)', 'F-SCORE'
        ]
        # training data
        xcm = confusion_matrix(self.y01_train, self.x_pred)
        self.train_precision = 0.0
        for i in range(len(xcm)):
            self.train_precision += xcm[i][i]
        self.train_precision = self.train_precision / sum(sum(xcm))
        print(xcm)
        # test data
        tcm = confusion_matrix(self.y01_test, self.y_pred)
        self.test_precision = 0.0
        for i in range(len(tcm)):
            self.test_precision += tcm[i][i]
        self.test_precision = self.test_precision / sum(sum(tcm))
        print(tcm)
        # Per-class metrics with each label treated as the positive class
        xls = self.cmm(xcm)
        tls = self.cmm(tcm)
        # Three table rows per class: header, train values, test values
        num = 0
        for key in xls:
            tmp01 = xls[key]
            tmp02 = tls[key]
            mlan = "类别:" + str(key)
            self.dtable01.setItem(num, 0,
                                  QtGui.QTableWidgetItem(mlan.decode('utf-8')))
            self.dtable01.setItem(num + 1, 0, QtGui.QTableWidgetItem('train'))
            self.dtable01.setItem(num + 2, 0, QtGui.QTableWidgetItem('test'))
            for j in range(len(tmp01)):
                self.dtable01.setItem(
                    num, j + 1,
                    QtGui.QTableWidgetItem(lab[j].decode('utf-8')))
                self.dtable01.setItem(
                    num + 1, j + 1,
                    QtGui.QTableWidgetItem(str(round(tmp01[j], 2))))
                self.dtable01.setItem(
                    num + 2, j + 1,
                    QtGui.QTableWidgetItem(str(round(tmp02[j], 2))))
            num = num + 3

        # dtable02: real vs. predicted labels on the training set
        self._fill_result_table(self.dtable02, self.y01_train, self.x_pred)
        # dtable03: real vs. predicted labels on the test set
        self._fill_result_table(self.dtable03, self.y01_test, self.y_pred)

        # Overall accuracy of the training and test sets
        self.train_e.setText(str(round(self.train_precision, 3)))
        self.test_e.setText(str(round(self.test_precision, 3)))

    def out_model(self):
        """Ask for a file name and save the trained model there."""
        self.filepath = str(
            QtGui.QFileDialog.getSaveFileName(self, "文件保存", "F:/",
                                              "Model Files (*.model)"))
        joblib.dump(self.clf, self.filepath.decode('GB2312'))

    def ptu01(self):
        """Handler of mm_Button01: plot the training confusion matrix."""
        mm = mm_matrix.c_matrix()
        mm.labels = self.labels
        mm.y_true = self.y01_train
        mm.y_pred = self.x_pred
        mm.p_tu()

    def ptu02(self):
        """Handler of mm_Button02: plot the test confusion matrix."""
        mm = mm_matrix.c_matrix()
        mm.labels = self.labels
        mm.y_true = self.y01_test
        mm.y_pred = self.y_pred
        mm.p_tu()

    def pro(self):
        """Handler of roc_Button: plot the ROC curve for the test set."""
        p_roc = proc_s.proc()
        p_roc.y_true = self.y01_test
        p_roc.y_pred = self.y_pred
        p_roc.labels = self.labels
        p_roc.mroc()
def main():
    """Compare BernoulliNB, LinearSVC and a small dense network with 10-fold CV.

    Reads ``data-1_train.csv`` (first row is the header), passes the rows
    through ``filterData`` and one-hot encodes column 1 of the filtered rows
    with a ``MultiLabelBinarizer``; column 4 of the raw rows is the label.

    For every fold the accuracy of both classical models is printed, followed
    by their per-class precision / recall / F-score averaged over the folds.
    A one-hidden-layer TensorFlow 1.x network is then trained and evaluated on
    the same folds.
    """
    n_folds = 10
    epochs_per_fold = 10
    batch_size = 300
    class_labels = ['-1', '1']  # classes reported by the per-class metrics

    with open('data-1_train.csv') as csv_file:
        data = list(csv.reader(csv_file, delimiter=','))
    fields = data[0]
    data = np.array(data[1:], dtype=object)
    print(data.shape, fields)
    words = filterData(data)
    print(words.shape)

    x_train = np.array([words[i][1] for i in range(len(data))])
    y_train = np.array([data[i][4] for i in range(len(data))])

    kf = KFold(n_splits=n_folds)

    # The encoder is fitted on the complete data set, so one fit before the
    # loop yields the same encoder as re-fitting it inside every fold.
    onehot_enc = MultiLabelBinarizer()
    onehot_enc.fit(x_train)

    # Rows: precision, recall, F-score; columns: one per entry of class_labels.
    scorers = (precision_score, recall_score, f1_score)
    nb_totals = np.zeros((len(scorers), len(class_labels)))
    svm_totals = np.zeros((len(scorers), len(class_labels)))

    for count, (train_index, test_index) in enumerate(kf.split(x_train), start=1):
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
        # Encode each split once and share it between both models.
        enc_train = onehot_enc.transform(x_train[train_index])
        enc_test = onehot_enc.transform(x_train[test_index])

        # binarize=None: the inputs are already 0/1 indicator vectors.
        bnbc = BernoulliNB(binarize=None)
        bnbc.fit(enc_train, y_train_kf)
        predicted_y = bnbc.predict(enc_test)
        print(enc_test)
        print(enc_test.shape)
        print('length of predicted', len(predicted_y))
        nb_totals += [
            scorer(y_test_kf, predicted_y, labels=class_labels, average=None)
            for scorer in scorers
        ]
        print(count, "Naive Bayesian Accuracy: ", bnbc.score(enc_test, y_test_kf))

        lsvm = LinearSVC()
        lsvm.fit(enc_train, y_train_kf)
        predicted_y = lsvm.predict(enc_test)
        svm_totals += [
            scorer(y_test_kf, predicted_y, labels=class_labels, average=None)
            for scorer in scorers
        ]
        print(count, "Linear SVM Accuracy: ", lsvm.score(enc_test, y_test_kf))
        print("")

    print('NB Avg. Precisions', nb_totals[0] / n_folds)
    print('NB Avg. Recalls', nb_totals[1] / n_folds)
    print('NB Avg. F-Scores', nb_totals[2] / n_folds)

    print('SVM Avg. Precisions', svm_totals[0] / n_folds)
    print('SVM Avg. Recalls', svm_totals[1] / n_folds)
    print('SVM Avg. F-Scores', svm_totals[2] / n_folds)

    # Neural network
    tf.reset_default_graph()
    vocab_len = len(onehot_enc.classes_)
    inputs_ = tf.placeholder(dtype=tf.float32,
                             shape=[None, vocab_len],
                             name="inputs")
    targets_ = tf.placeholder(dtype=tf.float32, shape=[None, 3], name="targets")

    h1 = tf.layers.dense(inputs_, 500, activation=tf.nn.relu)
    logits = tf.layers.dense(h1, 3, activation=None)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                   labels=targets_))
    optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)

    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(targets_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32),
                              name='accuracy')
    init_op = tf.global_variables_initializer()

    # The context manager releases the session even if training raises.
    with tf.Session() as sess:
        for train_index, test_index in kf.split(x_train):
            # Start every fold from freshly initialised weights. Without
            # this, the network evaluated on fold k has already been trained
            # on fold k's rows during earlier folds, which inflates the
            # reported test accuracy.
            sess.run(init_op)

            x_train_kf, x_test_kf = x_train[train_index], x_train[test_index]
            y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
            for _ in range(epochs_per_fold):
                for x_batch, y_batch in get_batch(
                        onehot_enc.transform(x_train_kf),
                        label2bool(y_train_kf), batch_size):
                    sess.run(optimizer,
                             feed_dict={
                                 inputs_: x_batch,
                                 targets_: y_batch
                             })

            test_acc = sess.run(accuracy,
                                feed_dict={
                                    inputs_: onehot_enc.transform(x_test_kf),
                                    targets_: label2bool(y_test_kf)
                                })
            print("Test Accuracy: {}".format(test_acc))
# Order the collected entries from largest to smallest, in place.
important_words.sort(reverse=True)

# In[245]:

important_words

# ## Report the classification accuracy and confusion matrix. Inspecting the weight vector of the logistic regression, what are the words that play the most important roles in deciding the sentiment of the reviews?

# In[246]:

clf = BernoulliNB()

# In[247]:

# Fit on the "x" data and measure accuracy on that same data.
k1 = clf.fit(traindata_x, trainlabels_x).predict(traindata_x)
sklearn.metrics.accuracy_score(trainlabels_x, k1)

# In[248]:

# Re-fit the same estimator on the "y" data and measure accuracy on it.
k2 = clf.fit(traindata_y, trainlabels_y).predict(traindata_y)
sklearn.metrics.accuracy_score(trainlabels_y, k2)

# In[249]:
# Element 1 of every parsed line is that document's label.
train_labels = [line[1] for line in lines]

# use scikit-learn to vectorize
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only whether a word occurs in a document (1/0), not how
# often, which is the input BernoulliNB models. It has to be the boolean True:
# the string 'true' only worked by being truthy and is rejected by
# scikit-learn releases that validate constructor parameters.
count_vectorizer = CountVectorizer(binary=True)

# Learn the vocabulary and replace the raw documents by their sparse
# document-term matrix (only the non-zero entries are stored).
train_documents = count_vectorizer.fit_transform(train_documents)

# Training phase
from sklearn.naive_bayes import BernoulliNB

classifier = BernoulliNB().fit(train_documents, train_labels)

# Testing phase: new text must go through the same fitted vectorizer.
testing = classifier.predict(
    count_vectorizer.transform(["this is the worst movie"]))
print(testing)
y_valid = dataset["yvalidate"] print("d = %d" % X.shape[1]) print("n = %d" % X.shape[0]) print("t = %d" % X_valid.shape[0]) print("Num classes = %d" % len(np.unique(y))) model = NaiveBayes(num_classes=4) model.fit(X, y) y_pred = model.predict(X_valid) v_error = np.mean(y_pred != y_valid) print("Naive Bayes (ours) validation error: %.3f" % v_error) model2 = BernoulliNB() model2.fit(X, y) y_pred = model2.predict(X_valid) v_error2 = np.mean(y_pred != y_valid) print("Scikit learn bernouilli nb validation error: %.3f" % v_error2) elif question == '3': with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f: dataset = pickle.load(f) X = dataset['X'] y = dataset['y'] Xtest = dataset['Xtest'] ytest = dataset['ytest'] for k in [1, 3, 10]: model = KNN(k) model.fit(X, y) y_pred = model.predict(Xtest)
# Two hand-written samples for clf1. They are right-padded with zeros up to
# 198 entries before being passed to predict().
A = [
    33, 57, 57, 84, 57, 84, 84, 58, 33, 68, 68, 68, 68, 68, 68, 68, 68, 68, 57,
    89
]
B = [
    0, 0, 0, 0, 57, 11, 37, 37, 37, 37, 37, 84, 57, 37, 37, 37, 37, 37, 37, 37
]
# A non-positive repeat count yields an empty list, so vectors that are
# already long enough stay untouched.
A.extend([0] * (198 - len(A)))
B.extend([0] * (198 - len(B)))
clf1.predict([A, B])

# In[42]:

clf3 = MultinomialNB()
clf3.fit(tennis_data, output_class)

# The same two raw (still unpadded) samples, re-created for clf3.
A = [
    33, 57, 57, 84, 57, 84, 84, 58, 33, 68, 68, 68, 68, 68, 68, 68, 68, 68, 57,
    89
]
B = [
    0, 0, 0, 0, 57, 11, 37, 37, 37, 37, 37, 84, 57, 37, 37, 37, 37, 37, 37, 37
]
# Both splits are read the same way: whitespace-separated, no header row,
# GB2312-encoded text.
_read_options = dict(header=None, encoding='gb2312', delim_whitespace=True)
trainData = pd.read_table('./dataset1/train.txt', **_read_options)
testData = pd.read_table('./dataset1/test.txt', **_read_options)

# Column 3 is the label; pop() removes it so only the features stay behind.
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

# Hand-written classifier: time train + predict + accuracy together.
time_start1 = time.time()
clf1 = BayesClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Bayes: %f" % score1)
print("Runtime of self-Bayes:", time_end1 - time_start1)

# scikit-learn reference, timed over the equivalent fit / predict / score steps.
time_start = time.time()
clf = BernoulliNB()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel)
time_end = time.time()
print("Accuracy of sklearn-Bayes: %f" % score)
print("Runtime of sklearn-Bayes:", time_end - time_start)
from sklearn import cross_validation from sklearn.metrics import confusion_matrix #Somente o nome do arquivo if __name__ == '__main__': for file in glob.glob(sys.argv[1] + '*.mat'): data = scipy.io.loadmat(file) #print("\nTreinando Naive Bayes...") clf = BernoulliNB(alpha=0.2) ytrain = data['Ytrain'].T.reshape(data['Ytrain'].shape[1]) Xtrain = data['Xtrain'] Xval = data['Xval'] clf.fit(Xtrain, ytrain) predict = clf.predict(Xval) yVal = data['Yval'].T.reshape(data['Yval'].shape[1]) print "\nAcuracia: ", accuracy_score(yVal, predict) X_train = data["Xtrain"] X_val = data["Xval"] cm = confusion_matrix(yVal, predict) total = numpy.sum(cm, axis=1) if (cm.shape[0] < 2): acc = 1.0 else: acc = [] for i in range(total.shape[0]): if (total[i] > 0):
stop_words=stop_words(), #ngram_range=(1,2), #max_features=4000 ) X.columns from sklearn.naive_bayes import BernoulliNB from sklearn.model_selection import train_test_split # Split train and test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # My first Naive Bayes classifier! clf = BernoulliNB() clf.fit(X_train, y_train) prediction = clf.predict(X_test) print(np.mean([prediction == y_test])) # 1 Train the classifier clf = BernoulliNB() clf.fit(X_train, y_train) # 2 Predict the data (We need to tokenize the data using the same vectorizer object) X_test = vectorizer.transform(test_df['text']).toarray() prediction = clf.predict(X_test) # 3 Create a the results file output = pd.DataFrame({'Party': prediction}) output.index.name = 'Id' output.to_csv('sample_submission.csv')
random_state=0) # Scaling sc = StandardScaler() X_train = sc.fit_transform(x_train) X_test = sc.transform(x_test) # Model Building # Chose the Bernoulli distribution algorithm to predict the binary(could be also say boolean) values like female or male b_nb = BernoulliNB() b_nb.fit(X_train, y_train.ravel()) y_pred = b_nb.predict(X_test) # making prediction as always """ Naive Bayes -> Bernoulli Naive Bayes : The naive Bayes training and classification algorithms for data that is distributed according to multivariate Bernoulli distributions. there may be multiple features but each one is assumed to be a binary-valued (Bernoulli, boolean) variable. Gaussian Naive Bayes : If your data is increasing continuously you can implement the Gaussian Naive Bayes algorithm for classification. Multinomial Naive Bayes : The naive Bayes algorithm for multinomially distributed data, and is one of the two classic naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice).
def _fit_bnb_and_write(train, cls, test, tag):
    """Fit a default BernoulliNB on (train, cls) and write its test predictions.

    Class probabilities go to ``./BNBbi_<tag>_probs.txt`` and the predicted
    classes to ``./BNBbi_<tag>_classes.txt``.
    """
    model = BernoulliNB()
    model.fit(train, cls)
    write_probs_to_file("./BNBbi_%s_probs.txt" % tag, model.predict_proba(test))
    write_classes_to_file("./BNBbi_%s_classes.txt" % tag, model.predict(test))


def main(argv):
    """Train three BernoulliNB variants and write their test-set predictions.

    The variants are: all features ("standard"), the k best features by ANOVA
    F-score ("kbest"), and the features passing a false-positive-rate test at
    level alpha ("fpr").

    Args:
        argv: command-line arguments without the program name. Recognised
            options: -b/--bow, -c/--cls, -t/--tst and -T/--tstcls (all four
            required), -k/--k (default 10), -a/--alpha (default 0.05) and
            -h/--help.

    Raises:
        SystemExit: status 0 after -h/--help; status 2 on an unknown option
            or when one of the four required files is missing.
    """
    start_time = time.time()
    print("running main()")
    usage = ('Usage: \n python naiveBayes.py -b <bagofwords_csv> -c <classes_txt>'
             ' -t <tst_bagofwords_csv> -T <tst_classes_txt> -k <kth_best>'
             ' -a <alpha>')

    bowfile = ''
    clsfile = ''
    tstfile = ''
    tstclsfile = ''
    # Fallbacks equal to scikit-learn's own defaults for SelectKBest(k=...)
    # and SelectFpr(alpha=...); previously omitting -k or -a crashed on an
    # unbound local.
    k = 10
    alpha = 0.05

    # Parse arguments. "h"/"help" must be declared here, otherwise getopt
    # rejects -h before the help branch below can ever run.
    try:
        opts, args = getopt.getopt(
            argv, "hb:c:t:T:k:a:",
            ["help", "bow=", "cls=", "tst=", "tstcls=", "k=", "alpha="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-b", "--bow"):
            bowfile = arg
        elif opt in ("-c", "--cls"):
            clsfile = arg
        elif opt in ("-t", "--tst"):
            tstfile = arg
        elif opt in ("-T", "--tstcls"):
            tstclsfile = arg
        elif opt in ("-k", "--k"):
            k = int(arg)
        elif opt in ("-a", "--alpha"):
            alpha = float(arg)

    # All four files are read unconditionally below, so fail early with the
    # usage text instead of part-way through the run.
    if not (bowfile and clsfile and tstfile and tstclsfile):
        print(usage)
        sys.exit(2)

    # Get bag of words array and sentiment array
    bow = read_bagofwords_dat(bowfile)
    cls = read_class_values_dat(clsfile)
    tst = read_bagofwords_dat(tstfile)
    # Loaded as before although nothing below consumes it; the read still
    # surfaces a bad test-classes file.
    tstcls = read_class_values_dat(tstclsfile)

    # Standard model
    _fit_bnb_and_write(bow, cls, tst, "standard")

    # Feature-selected models. Every selector is fitted on the training data
    # only and then applied to both splits.
    selectors = (
        ("kbest", SelectKBest(f_classif, k=k)),
        ("fpr", SelectFpr(f_classif, alpha=alpha)),
    )
    for tag, sel in selectors:
        sel.fit(bow, cls)
        _fit_bnb_and_write(sel.transform(bow), cls, sel.transform(tst), tag)

    # Runtime
    print('Runtime: ' + str(time.time() - start_time))
def _label_encode(frame):
    """Replace every column of *frame*, in place, by integer codes of its string form."""
    for column in frame.columns:
        frame[column] = LabelEncoder().fit_transform(frame[column].astype(str))
    return frame


def create_and_save_model(
        database="/Users/ziba/PycharmProjects/try6/realtime/database.db"):
    """Train three naive Bayes models, store them in SQLite and reload one.

    GaussianNB is fitted on the character data, MultinomialNB on the news
    data, and BernoulliNB on the character data and then again on the news
    data, so its final state is the news model. The test score of every fit
    is printed. All three models are pickled into the ``models`` table and
    the insert is committed. The Bernoulli model is then read back and used
    to predict the news test split.

    Args:
        database: path of the SQLite file; defaults to the original
            hard-coded location.

    Returns:
        Predictions of the reloaded Bernoulli model for the news test split.
    """
    # Encode every column as integers to avoid the
    # 'Could not convert string to float' error when fitting.
    data = _label_encode(pd.read_csv('character-predictions_pose.csv'))
    data4 = _label_encode(pd.read_csv('uci-news-aggregator.csv'))

    x = data.drop('isAlive', axis=1)
    y = data['isAlive']
    x2 = data4.drop('CATEGORY', axis=1)
    y2 = data4['CATEGORY']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x2_train, x2_test, y2_train, y2_test = train_test_split(x2,
                                                            y2,
                                                            test_size=0.2)

    gnb1 = GaussianNB()
    gnb2 = MultinomialNB()
    gnb3 = BernoulliNB()

    gnb1.fit(x_train, y_train)
    print(gnb1.score(x_test, y_test))

    gnb2.fit(x2_train, y2_train)
    print(gnb2.score(x2_test, y2_test))

    gnb3.fit(x_train, y_train)
    print(gnb3.score(x_test, y_test))
    gnb3.fit(x2_train, y2_train)
    print(gnb3.score(x2_test, y2_test))

    # The CSVs are deliberately NOT re-read here. The earlier version
    # reloaded them without encoding, so the final predict() received raw
    # string columns the models cannot consume.

    sql_create_models_table = """CREATE TABLE IF NOT EXISTS models(
                                    id integer PRIMARY KEY,
                                    name text NOT NULL,
                                    model BLOB NOT NULL);"""

    conn = sqlite3.connect(database)
    try:
        cur = conn.cursor()
        cur.execute(sql_create_models_table)

        # protocol=2 is a binary pickle format, so the column must be a BLOB
        # and the value must be wrapped in sqlite3.Binary().
        to_store = (("Gaussian", gnb1), ("Multinomial", gnb2),
                    ("Bernoulli", gnb3))
        for model_name, model in to_store:
            cur.execute("insert into models(name,model) values (?,?)", (
                model_name,
                sqlite3.Binary(pickle.dumps(model, protocol=2)),
            ))
        bernoulli_row_id = cur.lastrowid  # id of the last insert above

        # Without an explicit commit the inserts are rolled back when the
        # connection goes away and nothing is actually saved.
        conn.commit()

        # Fetch back only the Bernoulli model that was just written. It is
        # the one fitted on the news features, which x2_test belongs to; the
        # Gaussian model was fitted on the character data instead.
        cur.execute("select model from models where id=?",
                    (bernoulli_row_id, ))
        serializedModel = cur.fetchone()[0]
    finally:
        conn.close()

    # NOTE(security): pickle.loads executes code embedded in the blob; only
    # use this with a database file that this program itself writes.
    loadedModel = pickle.loads(serializedModel)
    return loadedModel.predict(x2_test)
# Grid-search the smoothing parameter of MultinomialNB with 5-fold CV.
param_grid = [{
    'alpha': alpha_list,
}]
clf = GridSearchCV(MultinomialNB(), param_grid, cv=5)
clf.fit(traindata, trainlabel)
print("best param: {0}\nbest score: {1}".format(clf.best_params_,
                                                 clf.best_score_))

# In[28]:

# Bernoulli naive Bayes
from sklearn.naive_bayes import BernoulliNB

ber_model = BernoulliNB(alpha=0.001)
ber_model.fit(traindata, trainlabel)
ber_predict = ber_model.predict(testdata)
print("bernoulli贝叶斯文本分类的准确率为:",
      metrics.accuracy_score(testlabel, ber_predict))

# In[29]:

# Gaussian naive Bayes classifier. It is fitted and evaluated on dense
# copies of the sparse matrices (hence .toarray()).
gauss_model = GaussianNB()
gauss_model.fit(traindata.toarray(), trainlabel)
# Predict with the Gaussian model itself. This line used to call
# ber_model.predict, so the "GaussianNB" accuracy below was really the
# Bernoulli model's.
gauss_predict = gauss_model.predict(testdata.toarray())
print("GaussianNB贝叶斯文本分类的准确率为:",
      metrics.accuracy_score(testlabel, gauss_predict))

# In[1]:

# Create the file
import os
def nb():
    """Fit a default BernoulliNB on the module-level training split.

    Returns:
        A 3-tuple of: the result of ``accuracy(test_y, predictions)``, the
        class probabilities predicted for ``test_x``, and ``test_y`` itself.
    """
    model = BernoulliNB()
    model.fit(train_x, train_y)
    test_accuracy = accuracy(test_y, model.predict(test_x))
    test_probabilities = model.predict_proba(test_x)
    return test_accuracy, test_probabilities, test_y
# Hand-rolled replacement for a grid search, with the train/test split
# looped in.
import sklearn.cross_validation

k_fold = KFold(len(y_train), n_folds=10, shuffle=True, random_state=123)
model = BernoulliNB(alpha=1)
# NOTE(review): this scores `clf` on X / y, not the `model` built on the
# line above, while the folds are sized from y_train - confirm that is
# intended.
print(cross_val_score(clf, X, y, cv=k_fold, n_jobs=1))

# Going by best AUC the winner is "the default": alpha = 1 and binarize = 0.
model = BernoulliNB(alpha=1)

# Fit on the `_sm` training arrays; ravel() flattens the labels to 1-D.
model.fit(x_train_sm, y_train_sm.ravel())

# Predictions for the held-out test set
prediction = model.predict(x_test)

# Metrics
print(
    "\n\n Welcome to Naive Bayes. \n It 'Naively' assumes independance between variables. \n It's best feature is being very quick and relatively easy to make.\n Used mostly in text classification and reccomender systems \n and here we can see that it is awful \n\n"
)
conf_matrix = skmet.confusion_matrix(y_test, prediction)
print("CONFUSION MATRIX: \n", conf_matrix)
report = skmet.classification_report(y_test, prediction)
print("\n CLASSIFICATION REPORT:\n\n", report)
accuracy_pct = round(100 * skmet.accuracy_score(y_test, prediction), 2)
print('ACCURACY -> ', accuracy_pct, '%')
print("recall:", skmet.recall_score(y_test, prediction))
print("precision:", skmet.precision_score(y_test, prediction))
print("f1_score:", skmet.f1_score(y_test, prediction))
x, y = l.split(' ') Y_test.append(y) temp = open(path4 + x, 'r') temp = temp.read() X_test.append(temp) x_train = X_train y_train = Y_train x_test = X_test y_test = Y_test print("bow initiated") vect = fe.text.CountVectorizer(max_features=2000) X_train_dtm = vect.fit_transform(x_train) # pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names()) X_test_dtm = vect.transform(x_test) # pd.DataFrame(X_test_dtm.toarray(), columns=vect.get_feature_names()) tf_trans = fe.text.TfidfTransformer() X_train_tfidf = tf_trans.fit_transform(X_train_dtm) X_test_tfidf = tf_trans.transform(X_test_dtm) # creating and training logistic regression model print("training begins") BNBC = BernoulliNB() BNBC.fit(X_train_tfidf, y_train) print("test begins") y_predicted = BNBC.predict(X_test_tfidf) print(ac(y_test, y_predicted))
Gauss.fit(x_train, y_train) # In[11]: y_predicted_g = Gauss.predict(x_test) print(mean_squared_error(y_predicted_g, y_test)) print(y_test.values[1]) # In[12]: Bern = BernoulliNB() Bern.fit(x_train, y_train) # In[13]: y_predicted_b = Bern.predict(x_test) print(mean_squared_error(y_predicted_b, y_test)) # In[14]: Mult = MultinomialNB() Mult.fit(x_train, y_train) # In[15]: y_predicted_m = Mult.predict(x_test) print(mean_squared_error(y_predicted_m, y_test)) # In[16]: g = 0
X_train = matr[0:150, 1:] y_train = matr[0:150, 0] X_test = matr[150:194, 1:] y_test = matr[150:194, 0] from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X_all = scaler.fit_transform(X_all) from sklearn.naive_bayes import BernoulliNB model = BernoulliNB() model.fit(X_train, y_train) expected = y_test predicted = model.predict(X_test) print predicted # Import the random forest package from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False,
# In[108]: bnb = BernoulliNB() # In[109]: bnb.fit(X_train, y_train) # In[110]: y_pred = bnb.predict(X_test) # In[111]: confusion_matrix(y_test, y_pred) # In[112]: pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True) # In[113]:
verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=6, n_jobs=4) rnds.fit(X_train_little, y_train_little) # summarize the results of the random parameter search print(rnds.best_score_) print('\nalpha: ') print(rnds.best_estimator_.alpha) # apply best parameters nbb = BernoulliNB(alpha=rnds.best_estimator_.alpha) nbb.fit(X_train_pca,y_train) pred = nbb.predict(X_test_pca) pred_train = nbb.predict(X_train_pca) if learner: # Random Forest from sklearn.ensemble import RandomForestClassifier min_samples_leaf_r = np.round(np.linspace(1, 80, 30)) min_samples_leaf_range = min_samples_leaf_r.astype(int) max_depth_range = np.round(np.linspace(5, 15, 30)) param_dist = dict(min_samples_leaf=min_samples_leaf_range, max_depth=max_depth_range) cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) rnds = EvolutionaryAlgorithmSearchCV( estimator=RandomForestClassifier(n_estimators=(1+num_features/2)), params=param_dist, scoring="f1", cv=cv, verbose=1,
# ROC curve built from the tree model's fpr / tpr arrays.
plt.plot(fprTREE, tprTREE, '', label="Flight, auc= %0.2f" % aucTREE)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive')
plt.ylabel('True Positive')
plt.legend(loc=4)
plt.show()
''' ######################################## NAIVE BAYES ######################################## '''
# Import the algorithm and its BernoulliNB class.
from sklearn.naive_bayes import BernoulliNB

classificadorNB = BernoulliNB()
# Train the algorithm (builds the probability table).
classificadorNB.fit(previsores_treinamento, classe_treinamento)
# Predict the test data so the hit rate can be measured.
previsoesNB = classificadorNB.predict(previsores_teste)

# Mean accuracy on the given test data and labels.
test_score_text = "Best test score is {}".format(
    classificadorNB.score(previsores_teste, classe_teste))
print(test_score_text)

# Mean accuracy on the given training data and labels.
training_score_text = "Best training score is {}".format(
    classificadorNB.score(previsores_treinamento, classe_treinamento))
print(training_score_text)

# Metric that combines precision and recall.
f1NB = f1_score(classe_teste, previsoesNB, average='micro')

# Confusion matrix comparing true and predicted classes.
matrizNB = confusion_matrix(classe_teste, previsoesNB)

# Cross-validation result using 3 folds (k=3).
resultado_cvNB = cross_val_score(classificadorNB, previsores, classe, cv=3)
def BernoulliNB_Text(X, Y, testcase):
    """Fit a default BernoulliNB on (X, Y) and return its predictions for testcase.

    Args:
        X: training feature matrix.
        Y: training labels, one per row of X.
        testcase: feature matrix to classify.

    Returns:
        The array of predicted labels for ``testcase``.
    """
    clf = BernoulliNB()
    clf.fit(X, Y)
    # A stray `BernoulliNB(alpha=1.0, ...)` expression (a pasted REPL echo of
    # the fitted estimator) used to sit here; it only built and discarded a
    # second, unused estimator.
    return clf.predict(testcase)
def document_features(document):
    """Encode *document* as a 0/1 presence vector over ``word_features``.

    Entry i is 1 when ``word_features[i]`` occurs in the document and 0
    otherwise.
    """
    present = set(document)  # set lookup keeps each membership test O(1)
    return [int(word in present) for word in word_features]


# Feature vectors and class labels for the (document, class) training pairs.
train_features = [document_features(doc) for doc, _ in train_docs]
train_labels = [label for _, label in train_docs]

# Feature vectors and class labels for the (document, class) test pairs.
test_features = [document_features(doc) for doc, _ in test_docs]
test_labels = [label for _, label in test_docs]

#%% TRAIN CLASSIFIER
## Bernoulli NB
clf_country = BernoulliNB()
clf_country.fit(train_features, train_labels)
test_predictions = clf_country.predict(test_features)

# Share of test documents whose predicted label equals the true label.
accuracy = np.mean(test_predictions == test_labels)
print("Accuracy: %f" % accuracy)
print(classification_report(test_labels, test_predictions))

#%% trying with MULTINOMIAL NAIVE BAYES
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
u'\ud83c[\udf00-\udfff]|' u'\ud83d[\udc00-\ude4f\ude80-\udeff]|' u'[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) twt = myre.sub(r'', twt) twt = re.sub('[\s]+', ' ', twt) # remove additional white spaces return twt outtweets = (processTweet(texts.encode("utf-8"))) #outtweets = unicode(outtweets, 'utf8') print(outtweets) test_vectors = vectorizer.transform([outtweets]) prediction_nb = classifier_nb.predict(test_vectors) prediction_rf = classifier_rf.predict(test_vectors) prediction_sgd = classifier_sgd.predict(test_vectors) prediction_svm = classifier_svm.predict(test_vectors) prediction_dt = classifier_dt.predict(test_vectors) #if prediction_nb==1 or prediction_svm==1 or prediction_nb==1 or prediction_rf==1 or prediction_sgd: test_vectors2 = vectorizer2.transform([outtweets]) prediction_nb2 = classifier_nb2.predict(test_vectors2) prediction_rf2 = classifier_rf2.predict(test_vectors2) prediction_sgd2 = classifier_sgd2.predict(test_vectors2) prediction_svm2 = classifier_svm2.predict(test_vectors2) prediction_dt2 = classifier_dt2.predict(test_vectors2) #NB
########################### bnb(BernoulliNB) ################################
# Out-of-fold stacking: every training row gets the class probabilities from
# the fold model that did not see it; the test rows get the average of all
# fold models' probabilities.
print('BernoulliNB stacking')
stack_train = np.zeros((train_feature.shape[0], number))
stack_test = np.zeros((test_feature.shape[0], number))
score_va = 0
for i, (tr, va) in enumerate(kfold.split(train_feature, score), 1):
    print('stack:%d/%d' % (i, n_folds))
    bnb = BernoulliNB()
    bnb.fit(train_feature[tr], score[tr])
    score_va = bnb.predict_proba(train_feature[va])
    score_te = bnb.predict_proba(test_feature)
    print(score_va)
    # Squared error between the true and the predicted validation labels.
    fold_error = mean_squared_error(score[va], bnb.predict(train_feature[va]))
    print('得分' + str(fold_error))
    stack_train[va] += score_va
    stack_test += score_te
stack_test /= n_folds

# Training rows first, test rows after; one column per class, rounded to six
# decimals.
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame(
    np.around(stack, 6),
    columns=[
        'advertiser_id_tfidf_bnb_classfiy_{}'.format(i)
        for i in range(stack.shape[1])
    ])
df_stack.to_csv(path +
                'feature/advertiser_id_tfidf_bnb_error_single_classfiy.csv',
                index=None,
                encoding='utf8')
print('BernoulliNB特征已保存\n')
########################### mnb(MultinomialNB) ################################
# Predicted Fact Predicted Opinion Total # Actual Fact: 1007 114 1121 # Actual Optinion: 102 1008 1110 # 1109 1122 #============================================================================== # NAIVE BAYES #============================================================================== from sklearn.naive_bayes import BernoulliNB nb_classifier, precision, recall, accuracy, f1 = test_classifier( train[features], y_train, test[features], test['y_label'], BernoulliNB()) nb_cv_scores = cv(BernoulliNB(), data[features], data['y_label']) nb_classifier = BernoulliNB().fit(train[features], train['y_label']) nb_preds_epos = nb_classifier.predict(test[features]) nb_cm = metrics.confusion_matrix(test['y_label'], nb_preds_epos) print_cm(nb_cm) # Predicted Fact Predicted Opinion Total # Actual Fact: 895 226 1121 # Actual Optinion: 77 1033 1110 # 972 1259 #============================================================================== # SUPPORT VECTOR MACHINE #============================================================================== from sklearn import svm svm_classifier = svm.SVC() svm_classifier, precision, recall, accuracy, f1 = test_classifier(
def Q3():
    """Classify the 'Ecoli' data set with seven classifiers and rank attributes.

    3. Perform classification of the 'Ecoli' dataset using all of Naive
       Bayes / KNN / Decision Tree / Random Forests / SVM. (15 points)
       A. Which method shows the best accuracy, and why? Use 5-fold cross
          validation and report the averaged performance. (10 points)
       B. For the random forests model, list the THREE most important
          attributes for classifying the localization site. (5 points)

    Attribute information:
        1. Sequence Name: accession number for the SWISS-PROT database.
        2. mcg: McGeoch's method for signal sequence recognition.
        3. gvh: von Heijne's method for signal sequence recognition.
        4. lip: von Heijne's Signal Peptidase II consensus sequence score
           (binary attribute).
        5. chg: presence of charge on N-terminus of predicted lipoproteins
           (binary attribute).
        6. aac: score of discriminant analysis of the amino acid content of
           outer membrane and periplasmic proteins.
        7. alm1: score of the ALOM membrane spanning region prediction
           program.
        8. alm2: score of the ALOM program after excluding putative
           cleavable signal regions from the sequence.
        (9. protein localization sites)

    Reads ``ecoli.data`` from the working directory and prints the mean
    5-fold accuracy of every classifier, then the three attributes with the
    highest random-forest importance (taken from the last fold's forest).
    """
    f3 = "ecoli.data"
    df = pd.read_csv(
        f3,
        delim_whitespace=True,
        header=None,
        names=[
            "Sequence Name",
            "mcg",
            "gvh",
            "lip",
            "chg",
            "aac",
            "alm1",
            "alm2",
            "protein localization sites",
        ],
    )

    # Drop the sequence name (first column); the last column is the target.
    df_x = df.iloc[:, 1:-1]
    df_y = df.iloc[:, -1]
    features = df_x.columns
    # Convert once instead of twice per fold.
    x_all = df_x.to_numpy()
    y_all = df_y.to_numpy()

    # (label used in the report, factory building a fresh estimator), in
    # the order the models are fitted and reported.
    classifiers = [
        ("Gaussian Naive Bayes", GaussianNB),
        ("Bernoulli Naive Bayes", BernoulliNB),
        ("Multinomial Naive Bayes", MultinomialNB),
        ("KNN", KNeighborsClassifier),
        ("Decision Tree", DecisionTreeClassifier),
        ("Random Forests", lambda: RandomForestClassifier(n_jobs=-1)),
        ("SVM", lambda: SVC(kernel="rbf")),
    ]
    fold_scores = [[] for _ in classifiers]

    kf = KFold(n_splits=5, shuffle=True)
    for idx_train, idx_test in kf.split(df_x):
        x_train, x_test = x_all[idx_train], x_all[idx_test]
        y_train, y_test = y_all[idx_train], y_all[idx_test]

        for scores, (_, make_model) in zip(fold_scores, classifiers):
            model = make_model().fit(x_train, y_train)
            scores.append(accuracy_score(y_test, model.predict(x_test)))
            if isinstance(model, RandomForestClassifier):
                # Kept for the attribute ranking after the loop.
                RF_model = model

    for (label, _), scores in zip(classifiers, fold_scores):
        print("Accuracy score using %s: %.2f" % (label, mean(scores)))

    # Output of one earlier run, kept for reference:
    # Accuracy score using Gaussian Naive Bayes: 0.76
    # Accuracy score using Bernoulli Naive Bayes: 0.42
    # Accuracy score using Multinomial Naive Bayes: 0.43
    # Accuracy score using KNN: 0.86
    # Accuracy score using Decision Tree: 0.80
    # Accuracy score using Random Forests: 0.87
    # Accuracy score using SVM: 0.87

    importances = RF_model.feature_importances_
    # argsort is ascending, so reverse it before taking the first three.
    # The unreversed order reported the three LEAST important attributes.
    indices = np.argsort(importances)[::-1]
    important_attributes = [str(name) for name in features[indices[:3]]]
    print("THREE most important attributes: ", important_attributes)
# Training labels as a flat integer vector.
y = y.astype('int').flatten()

# Load test data; labels get the same int + flatten treatment.
z = np.load('data/test_encoded_array_new.npy')
t = np.load('data/test_target_array_new.npy').astype('int').flatten()

# Predict using the naive Bayes model on a 500-component NMF projection.
# The factorisation is fitted on the training matrix only and then applied
# to the test matrix.
clf = BernoulliNB(alpha=1)
nmf = NMF(n_components=500, init='random', random_state=0)
x_500d = nmf.fit_transform(x)
z_500d = nmf.transform(z)
clf.fit(x_500d, y)
p = clf.predict(z_500d)

# Elapsed time since startTime; as measured here it covers loading,
# factorisation, fitting and prediction.
endTime = datetime.datetime.now() - startTime
print("Total time taken to train: ", endTime)
print("\n")
print("Bernoulli Naive Bayes with 500 features and alpha = 1")

# normalize=False returns the number of correct predictions; convert that to
# a percentage of the test set.
accuracy = metrics.accuracy_score(t, p, normalize=False)
print("Accuracy: ", (accuracy / len(t)) * 100)

# Confusion matrix
confusion_matrix = metrics.confusion_matrix(t, p)
print("Confusion Matrix:\n", confusion_matrix)
#Upgrade the Posterior Probability of a packet being not an attack packet Test_Post_N = CalcPosterior(1-NewPrior, LLA, LLN) print "Test Posterior No attack", Test_Post_N attack_count = attack_count+ DetectAttack(PThres,Test_Post_A) i=i+1 if i == 20: store_rtt=[] store_grat=[] i=0 #We will give the training data to fit into the Naive Bayes Model ber = BernoulliNB() ber.fit(Train_Features, Train_Labels) testMat = np.array testMat = np.zeros((Train_len,5),dtype='i,f,f,i,b') for rtt,rss,seq_flag,grat in zip(logtest_rtt,logtest_rss,logtest_seq,logtest_grat): testMat[j,1]= testMat[j,0]= ber.predict(testMat[j,:]) j=j+1
# Show what the fitted clustering model found.
centers = clus.cluster_centers_
print("Centerss:")
print(centers)
labels = clus.labels_
print("Labels:")
print(labels)

#print(UsersId)
#UsersId.astype(float)
#print(preprocessing.scale(UsersId))

# Predicting
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
clf.fit(SomeData, Popular)
# A bare `BernoulliNB(alpha=1.0, binarize=0.0, ...)` expression (a pasted
# REPL echo of the fitted estimator) used to follow here; it only built and
# discarded a second estimator.

print("Predict popularity according to likes, retweets and followers")
# The predictions are made for the same rows the model was fitted on.
pred = clf.predict(SomeData)
print(pred)
#for s in pred:
#    print(s)
print("Predict Probabilidade")
print(clf.predict_proba(SomeData))

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
#dataNum = vec.fit_transform(SomeData).toarray()
#print(dataNum)