def BernoulliNB_1(train_predictors,test_predictors,train_target,test_target):
    """Fit a default BernoulliNB and report its accuracy on the test split.

    The model is trained on ``train_predictors``/``train_target`` and then
    used to label ``test_predictors``; the accuracy against ``test_target``
    is printed.

    Returns:
        A ``(accuracy, predicted)`` tuple.
    """
    model = BernoulliNB()
    model.fit(train_predictors, train_target)
    predicted = model.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print("Accuracy for Bernoulli Naive Bayes: %s" % (accuracy,))
    return accuracy, predicted
def tryBinomialNaiveBayes(goFast):
    """Grid-search BernoulliNB settings and print the best validation score.

    Every combination of ``alpha``, ``binarize`` (both 0..1 in steps of 0.1)
    and ``fit_prior`` is fitted on the training split and scored on the
    validation split. Progress and every new maximum are printed.

    Args:
        goFast: when truthy, use the ``dt1_1500.*`` files with a fixed
            feature count; otherwise use the full ``dt1.*`` files.
    """
    from sklearn.datasets import load_svmlight_file, load_svmlight_files
    from sklearn.naive_bayes import BernoulliNB

    if goFast:
        training_data, training_labels = load_svmlight_file(
            "dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file(
            "dt1_1500.vld.svm", n_features=253659, zero_based=True)
    else:
        # Load both splits in one call so they are forced to share the same
        # number of columns; loading them separately infers the width per
        # file and can make scoring fail on a shape mismatch.
        (training_data, training_labels,
         validation_data, validation_labels) = load_svmlight_files(
            ["dt1.trn.svm", "dt1.vld.svm"])
    # The test split was loaded by the previous version but never used, so it
    # is no longer read here.

    grid = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    best_score = 0
    for alpha_value in grid:
        for binarize_value in grid:
            for fit_prior_value in [True, False]:
                # Keyword arguments: recent scikit-learn releases reject
                # positional hyper-parameters.
                binary_operator = BernoulliNB(alpha=alpha_value,
                                              binarize=binarize_value,
                                              fit_prior=fit_prior_value)
                binary_operator.fit(training_data, training_labels)
                current_score = binary_operator.score(validation_data,
                                                      validation_labels)
                print("Current test: %s %s %s"
                      % (alpha_value, binarize_value, fit_prior_value))
                print("Current score: " + str(current_score))
                if current_score > best_score:
                    best_score = current_score
                    print("***NEW MAXIMUM SCORE: " + str(best_score))
                    print("***NEW MAXIMUM PARAMETERS: %s %s %s"
                          % (alpha_value, binarize_value, fit_prior_value))
    print("Best score was " + str(best_score))
def render_content(self):
    """Train a BernoulliNB on the selected text column and render a summary.

    The text column is vectorised with a ``CountVectorizer`` (stop words and
    vocabulary size taken from the instance), a default BernoulliNB is fitted
    against the code column, and the model is evaluated on that same data.

    Returns:
        An HTML snippet with the rounded accuracy and the confusion matrix,
        or a plain message when no text source is selected.
    """
    if self.text_source is None:
        return "No text source selected."

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import BernoulliNB
    from sklearn import metrics

    self.dm("creating vectorizer")
    vectorizer = CountVectorizer(
        stop_words=self.get_user_list(self.stop_list),
        max_features=self.vocab_size)
    documents = self.get_column_data(self.text_source)

    self.dm("using vectorizer")
    features = vectorizer.fit_transform(documents)
    codes = self.get_column_data(self.code_source)

    self.dm("creating classifier")
    model = BernoulliNB()
    model.fit(features, codes)
    accuracy = model.score(features, codes)

    self.dm("predicting")
    confusion = metrics.confusion_matrix(codes, model.predict(features))

    self.dm("displaying result")
    return ("accuracy is " + str(round(accuracy, 2))
            + '<pre>' + str(confusion) + '</pre>')
class NaiveBayesClassifierBernoulli:
    """Wrap scikit-learn's BernoulliNB for sentence classification.

    The model is trained once, at construction time, from an svmlight matrix
    file; sentences are converted to feature rows by
    ``Parser.MatrixParserForLearning`` using a pickled dictionary.
    """

    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        """Load the training matrix and dictionary, then fit the classifier."""
        self.X, self.Y = load_svmlight_file(matrixFileName)
        # NOTE: pickle executes arbitrary code on load; only point this at
        # dictionary files produced by a trusted source.
        with open(dicFileName, "rb") as dict_file:
            self.dictionary = pickle.load(dict_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        """Return the predicted label array for ``string``.

        Returns ``None`` when the parser cannot build a feature row.
        """
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        # Identity test: ``row != None`` is evaluated elementwise (or is
        # ambiguous) when the row is an array or sparse matrix.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self, string):
        """Return P(second class) - P(first class) for ``string``.

        Returns ``None`` when the parser cannot build a feature row.
        """
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is None:
            return None
        proba = self.bernoulliNB.predict_proba(row)
        return proba[0][1] - proba[0][0]
def NB_train_classifier(train_x, train_y):
    """Return a default BernoulliNB fitted on ``train_x`` / ``train_y``.

    No prediction is made here; the caller receives the trained classifier.
    """
    fitted = BernoulliNB()
    fitted.fit(train_x, train_y)
    return fitted
def bernoulli_classify():
    """Train a BernoulliNB on the article training set and report accuracy.

    Each ``*-articles.json`` file supplies samples for one target, the target
    name being the file name without its ``-articles.json`` suffix. Every
    test sample's true and predicted target is printed, followed by a
    summary line.
    """

    def _load(pattern):
        # Collect (samples, targets) from every file matching the pattern.
        data = []
        targets = []
        for path in glob.glob(pattern):
            target = path.replace("-articles.json", "")
            target = re.sub(r".*\/+", "", target)
            output = readWholeFileBernoulli(path, target)
            data.extend(output[0])
            targets.extend(output[1])
        return data, targets

    traindata, traintarget = _load(
        "../../../res/articles/training_data/*-articles.json")
    testdata, testtarget = _load(
        "../../../res/articles/test_data/*-articles.json")

    clf = BernoulliNB()
    clf.fit(traindata, traintarget)

    total = len(testdata)
    ncorrect = 0
    if total:
        # Predict the whole test set in one 2-D call: estimators expect a
        # (n_samples, n_features) input, not one bare sample at a time.
        for correct, predicted in zip(testtarget, clf.predict(testdata)):
            if correct == predicted:
                ncorrect += 1
            print("Correct: {0} - Predicted: {1}".format(correct, predicted))

    # Guard the ratio so an empty test directory reports 0.0 instead of
    # raising ZeroDivisionError.
    correctness = ncorrect * 1.0 / total if total else 0.0
    print("Correct  %s  Total  %s  Correctness  %s"
          % (ncorrect, total, correctness))
def test_BernouliNB2():
    """Fit BernoulliNB on eight uniquely-labelled points and print predictions.

    Each row of the probe matrix ``X2`` is predicted on its own and printed
    next to the predicted label.
    """
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
    ])
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print('Y ' + str(Y))

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1002, 1010],
        [1010, 910],
        [1003, 980],
        [1008, 1030],
        [-1, -1],
        [-3, -10],
        [40, 1],
        [1, -100],
    ])
    for i, sample in enumerate(X2):
        # predict() wants a 2-D (n_samples, n_features) array, so a single
        # sample is reshaped to one row.
        pred_ret = clf.predict(sample.reshape(1, -1))
        # Print the probe row that was predicted; the previous version
        # printed the unrelated training row X[i].
        print('X[' + str(i) + '] = ' + str(sample)
              + ' pred_ret ' + str(pred_ret))
def MungeData(train, test):
    """Replace categorical columns by BernoulliNB probabilities, in place.

    A fixed list of columns is dropped from both frames. Every remaining
    object-dtype feature (columns from position 2 on) is binarised with
    ``Binarize``, a BernoulliNB is fitted on the binarised training columns
    against ``train.target``, and the column is overwritten with the
    predicted probability of the second class. Finally all features are cast
    to float and missing values become -1.

    Returns:
        The ``(train, test)`` frames.
    """
    todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print(todrop)
    for frame in (train, test):
        frame.drop(todrop, axis=1, inplace=True)

    for col in train.columns[2:]:
        if train[col].dtype != 'object':
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        test, _ = Binarize(col, test, binfeatures)
        # Names of the temporary indicator columns for this feature.
        bin_cols = col + '_' + binfeatures
        nb = BernoulliNB()
        nb.fit(train[bin_cols].values, train.target.values)
        train[col] = nb.predict_proba(train[bin_cols].values)[:, 1]
        test[col] = nb.predict_proba(test[bin_cols].values)[:, 1]
        train.drop(bin_cols, inplace=True, axis=1)
        test.drop(bin_cols, inplace=True, axis=1)

    features = train.columns[2:]
    train[features] = train[features].astype(float)
    test[features] = test[features].astype(float)
    train.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)
    return train, test
def bnb_fit(train_data, train_lbl_data):
    """Announce the start, then fit and return a default BernoulliNB."""
    from sklearn.naive_bayes import BernoulliNB

    print("Starts bnb")
    model = BernoulliNB()
    model.fit(train_data, train_lbl_data)
    return model
def predict(cur, plyr_id, game_plyrs):
    """Return an expected value for a player from a BernoulliNB over rosters.

    A training matrix with one row per game the player took part in and one
    column per known player id is filled by ``populate_training_set``; the
    targets come from ``training_output_vector``. A single-row matrix for the
    game described by ``game_plyrs`` is then classified, and the normalised
    class probabilities are combined with fixed per-class values.

    Returns:
        The expected value rounded to one decimal, or 0 when the player has
        no games to train on.
    """
    all_plyrs = all_player_ids(cur)
    games = games_played_in(cur, plyr_id)
    n_cols = all_plyrs.shape[0]
    m_rows = games.shape[0]

    X = pd.DataFrame(np.zeros((m_rows, n_cols)),
                     index=games, columns=all_plyrs)
    populate_training_set(cur, X, games, plyr_id)
    print("X:  %s" % (X.values,))

    # Vector of known outputs, one per training game.
    Y = training_output_vector(cur, games, plyr_id)
    print("(len) Y:  %s %s" % (len(Y), Y))

    # One row only: the previous version built this frame from the m-row
    # training zeros, so every call classified m rows and kept just the first.
    test_X = pd.DataFrame(np.zeros((1, n_cols)), columns=all_plyrs)
    update_training_matrix(game_plyrs, 0, test_X)

    if len(X.values) == 0:
        return 0

    nb_clf = BernoulliNB()
    nb_clf.fit(X, Y)
    print("test_X:  %s" % (test_X.values,))

    # Compute the class probabilities once and reuse them.
    class_probs = nb_clf.predict_proba(test_X)[0]
    # A copy is passed in case normalize_probs modifies its argument.
    nb_norm_prob = normalize_probs(class_probs.copy())

    # NOTE(review): presumably one representative value per class, in class
    # order - confirm against training_output_vector.
    avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    print("param vector:  %s" % (class_probs,))
    print("probs:  %s" % (nb_norm_prob,))
    print(avgs)
    ev = expected_val(nb_norm_prob, avgs)
    return round(ev, 1)
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias):
    """Combine content and syntax classifiers by voting and return accuracy.

    Two naive Bayes models (multinomial, Bernoulli) are trained on
    ``train_x`` and two SVMs (linear, RBF) on ``train_f_x``.

    * ``bias == 'content'``: keep the multinomial label when the Bernoulli or
      the linear-SVM label agrees with it, otherwise use the Bernoulli label.
    * ``bias == 'syntax'``: keep the multinomial label when either SVM agrees
      with it, otherwise use the linear-SVM label.

    Returns:
        The fraction of combined labels equal to ``test_y``.

    Raises:
        ValueError: if ``bias`` is neither ``'content'`` nor ``'syntax'``.
    """
    # Reject a bad bias before any training: previously the message was only
    # printed after all four models had been fitted, and accuracy was then
    # computed against an empty label array.
    if bias not in ('content', 'syntax'):
        raise ValueError('Please enter a valid bias ("syntax" or "content")!')

    clf_c1 = MultinomialNB()
    clf_c1.fit(train_x, train_y)
    p1 = clf_c1.predict(test_x)

    clf_f1 = svm.SVC(kernel='linear', cache_size=512)
    clf_f1.fit(train_f_x, train_f_y)
    p3 = clf_f1.predict(test_f_x)

    # Only the model that actually takes part in the chosen vote is trained.
    if bias == 'content':
        clf_c2 = BernoulliNB()
        clf_c2.fit(train_x, train_y)
        p2 = clf_c2.predict(test_x)
        p_combined = np.where((p1 == p2) | (p1 == p3), p1, p2)
    else:
        clf_f2 = svm.SVC(kernel='rbf', cache_size=512)
        clf_f2.fit(train_f_x, train_f_y)
        p4 = clf_f2.predict(test_f_x)
        p_combined = np.where((p1 == p3) | (p1 == p4), p1, p3)

    # Mean of the boolean match vector equals matches / len(test_y).
    return np.mean(p_combined == test_y)
def doclassify(self, type='normal'):
    """Fit a default BernoulliNB on the training data and print its score.

    Only ``type == 'normal'`` is handled; any other value does nothing. The
    score is computed on the same data the model was trained on.
    """
    if type == 'normal':
        clf = BernoulliNB()
        clf.fit(self.train_x, self.train_y)
        # A stray ``BernoulliNB(alpha=1.0, ...)`` expression stood here; it
        # built an estimator and threw it away, so it was removed.
        score = clf.score(self.train_x, self.train_y)
        print('score =  %s' % (score,))
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    """Predict test labels with a BernoulliNB (alpha=0.01).

    The model is trained on ``data_train_vectors`` against
    ``data_train.target``. Extra keyword arguments are accepted and ignored.

    Returns:
        The predictions for ``data_test_vectors``.
    """
    model = BernoulliNB(alpha=.01)
    model.fit(data_train_vectors, data_train.target)
    return model.predict(data_test_vectors)
def compareClassifiers():
    """Score four classifiers on the observations from ``createObservations``.

    A decision tree, a BernoulliNB (binarisation disabled), a linear SVM and
    a logistic regression are each fitted on the full data set, scored on
    that same data, and cross-validated with 10 folds on accuracy.

    Returns:
        A list of ``(training_score, mean_cv_score)`` tuples in the order
        tree, naive Bayes, SVM, logistic regression.
    """
    (observations, classes) = createObservations()
    observations = np.array(observations)
    classes = np.array(classes)

    estimators = [
        tree.DecisionTreeClassifier(),
        BernoulliNB(binarize=None),
        LinearSVC(),
        LogisticRegression(),
    ]

    results = []
    for estimator in estimators:
        estimator.fit(observations, classes)
        training_score = estimator.score(observations, classes)
        cv_scores = cross_validation.cross_val_score(
            estimator, observations, classes, scoring='accuracy', cv=10)
        results.append((training_score, np.mean(cv_scores)))
    return results
def test_BernouliNB4():
    """Fit BernoulliNB on ten binary samples and print the label for [1, 1]."""
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
    ])
    print('X ' + str(X))
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print('Y ' + str(Y))

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1, 1],
    ])
    for i, sample in enumerate(X2):
        # predict() wants a 2-D (n_samples, n_features) array, so a single
        # sample is reshaped to one row.
        pred_ret = clf.predict(sample.reshape(1, -1))
        print('X[' + str(i) + '] = ' + str(sample)
              + ' pred_ret ' + str(pred_ret))
def MungeData(train, test, validation):
    """Replace categorical columns by BernoulliNB probabilities in three frames.

    Every object-dtype feature of ``train`` (columns from position 2 on,
    except ``v22``) is binarised with ``Binarize`` in all three frames. A
    BernoulliNB fitted on the binarised training columns against
    ``train.target`` supplies the probability of the second class, which
    overwrites the column as a float.

    Returns:
        The ``(train, test, validation)`` frames.
    """
    features = train.columns[2:]
    print(type(features))
    for col in features:
        if train[col].dtype != 'object' or col == "v22":
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        test, _ = Binarize(col, test, binfeatures)
        validation, _ = Binarize(col, validation, binfeatures)

        # Names of the temporary indicator columns for this feature.
        bin_cols = col + '_' + binfeatures
        nb = BernoulliNB()
        nb.fit(train[bin_cols].values, train.target.values)

        frames = (train, test, validation)
        for frame in frames:
            frame[col] = nb.predict_proba(frame[bin_cols].values)[:, 1]
        for frame in frames:
            frame.drop(bin_cols, inplace=True, axis=1)
        for frame in frames:
            frame[col] = frame[col].astype(float)
    return train, test, validation
def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """Fit a BernoulliNB on pickled features and write a ranked id list.

    Args:
        output_file: path of the CSV to write. The default name is a
            timestamp computed once, when this function is defined.
        in_pkl: path of a joblib dump holding ``(trainFeatures, trainTargets,
            trainItemIds, testFeatures, testItemIds)``.

    Returns:
        An error message string when ``in_pkl`` is missing, otherwise None.

    The output has an ``id`` header followed by test item ids, ordered by
    descending log-probability of the second class.
    """
    logging.info("Loading features...")
    if not in_pkl:
        return "input .plk required"
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")

    # Bernoulli naive Bayes without binarisation.
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures, trainTargets)

    logging.info("Predicting...")
    # Log-probabilities rather than hard labels, so items can be ranked.
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s" % output_file)
    ranked = sorted(zip(predicted_scores, testItemIds), reverse=True)
    # The context manager closes the file even if a write fails.
    with open(output_file, "w") as f:
        f.write("id\n")
        for pred_score, item_id in ranked:
            # Only the item id is part of the output format.
            f.write("%d\n" % (item_id))
    logging.info("Done.")
def generatePredictingModel(data):
    """Build the model that predicts a video's category from its text.

    Two BernoulliNB models are trained on bag-of-words counts, one for the
    title and one for the description. Their predictions on the training
    split are the two inputs of a decision tree that produces the final
    category.

    Returns:
        ``(tree, model_count_title, model_count_description,
        count_vectorizer_title, count_vectorizer_description)``.

    Raises:
        VideoAnalysisException: if any step of the construction fails.
    """
    try:
        # Timer for the build duration.
        start = time.time()

        # Split into train/test sets.
        X = data[[x for x in data.columns if x in ('title', 'description')]]
        # Exact match: ``x in ('video_category_id')`` tested against a plain
        # string, so any column whose name was a substring of it (for
        # example 'id') was selected as well.
        Y = data[[x for x in data.columns if x == 'video_category_id']]
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, train_size=0.80, random_state=10)

        # One vocabulary per text field, learnt on the training split.
        corpus_title = X_train['title'].values.tolist()
        corpus_description = X_train['description'].values.tolist()
        count_vectorizer_title = CountVectorizer()
        count_vectorizer_description = CountVectorizer()
        count_vectorizer_title.fit(corpus_title)
        count_vectorizer_description.fit(corpus_description)

        # Sparse count matrices.
        X_train_count_title = count_vectorizer_title.transform(X_train['title'])
        X_train_count_description = count_vectorizer_description.transform(X_train['description'])
        X_test_count_title = count_vectorizer_title.transform(X_test['title'])
        X_test_count_description = count_vectorizer_description.transform(X_test['description'])

        # One naive Bayes model per text field.
        model_count_title = BernoulliNB()
        model_count_description = BernoulliNB()
        model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
        model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

        # Stack both predictions as inputs of the final tree.
        new_df_train = pd.DataFrame()
        new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
        new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
        new_df_test = pd.DataFrame()
        new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
        new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)

        tree = DecisionTreeClassifier()
        tree.fit(new_df_train, Y_train)

        end = time.time()
        execution_time = end - start
        print("Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time))
        time.sleep(3)
        return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not converted.
        raise VideoAnalysisException(" Error while creation of predictive model ")
def score(train_X, train_y):
    """Return the log loss of a BernoulliNB on a 1% hold-out split.

    The data is split with a fixed random state, the model is fitted on the
    larger part, and its class probabilities on the held-out part are scored
    with ``log_loss``.
    """
    split = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    X_train, X_valid, y_train, y_valid = split

    # NOTE(review): ``binarize=False`` is not ``None``; presumably it acts as
    # a threshold of 0 rather than disabling binarisation - confirm intent.
    model = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    model.fit(X_train, y_train)
    return log_loss(y_valid, model.predict_proba(X_valid))
def testBoGNB(self):
    '''
    Test on sentiment analysis task using Naive Bayes classifier with
    Bag-of-Word feature vectors.

    Words are lower-cased whitespace tokens longer than two characters; the
    vocabulary is built from the training and test sentences together. Each
    sentence becomes a binary presence vector. Training accuracy, test
    accuracy and the class balance of both sets are printed with ``pprint``.
    '''

    def tokenize(sent):
        # Lower-cased tokens with more than two characters.
        return [word.lower() for word in sent.split() if len(word) > 2]

    def build_features(sentences, word2index):
        # Binary presence matrix: one row per sentence, one column per word.
        # ``np.float64`` replaces the removed ``np.float`` alias.
        feat = np.zeros((len(sentences), len(word2index)), dtype=np.float64)
        for i, sent in enumerate(sentences):
            # A real list is required: a lazy ``map`` object cannot be used
            # as a fancy index on Python 3.
            indices = [word2index[word] for word in tokenize(sent)]
            feat[i, indices] = 1.0
        return feat

    # Vocabulary over the training and test sentences.
    wordlist = []
    for sent in self.senti_train_txt:
        wordlist.extend(tokenize(sent))
    for sent in self.senti_test_txt:
        wordlist.extend(tokenize(sent))
    word_dict = set(wordlist)
    word2index = dict(zip(word_dict, range(len(word_dict))))

    train_size = len(self.senti_train_txt)
    test_size = len(self.senti_test_txt)
    pprint('Training set size: %d' % train_size)
    pprint('Test set size: %d' % test_size)

    start_time = time.time()
    train_feat = build_features(self.senti_train_txt, word2index)
    test_feat = build_features(self.senti_test_txt, word2index)
    end_time = time.time()
    pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time))

    pprint('Classification using Bernoulli Naive Bayes classifier: ')
    clf = BernoulliNB()
    clf.fit(train_feat, self.senti_train_label)
    train_pred_label = clf.predict(train_feat)
    train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size)
    pprint('Training accuracy = %f' % train_acc)
    pred_label = clf.predict(test_feat)
    acc = np.sum(pred_label == self.senti_test_label) / float(test_size)
    pprint('Accuracy: %f' % acc)

    train_pos_count = np.sum(self.senti_train_label == 1)
    train_neg_count = np.sum(self.senti_train_label == 0)
    test_pos_count = np.sum(self.senti_test_label == 1)
    test_neg_count = np.sum(self.senti_test_label == 0)
    pprint('Positive count in training set: %d' % train_pos_count)
    pprint('Negative count in training set: %d' % train_neg_count)
    pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count))
    pprint('Positive count in test set: %d' % test_pos_count)
    pprint('Negative count in test set: %d' % test_neg_count)
    pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
def nb_classifier(self, secret):
    """Train a BernoulliNB on the selected features for ``secret``.

    The raw attribute vectors are reduced by the selector returned from
    ``feature_sel`` and the model is evaluated on its own training data.

    Returns:
        ``(classifier, feature_selector, evaluation)``.
    """
    x = self.raw_attr_vector(secret)
    y = self.get_labels(secret)
    fsl = self.feature_sel(secret)
    reduced = fsl.transform(x)

    clf = BernoulliNB()
    clf.fit(reduced, y)
    fitted_labels = clf.predict(reduced)
    return clf, fsl, self.evaluate(fitted_labels, y)
def bnb(X,y,Z,test_data):
    """Fit a BernoulliNB on ``X``/``y`` and write probabilities for ``Z``.

    The probability of the second class for each row of ``Z`` is stored in a
    ``truth`` column indexed by ``test_data["enrollment_id"]`` and saved as
    ``data/result/sixth_bnb.csv`` relative to the working directory.
    """
    import os
    from sklearn.naive_bayes import BernoulliNB

    model = BernoulliNB()
    model.fit(X, y)
    test_probs_bnb = model.predict_proba(Z)[:, 1]
    sub = pd.DataFrame({'enrollment_id': test_data["enrollment_id"],
                        'truth': test_probs_bnb}).set_index("enrollment_id")
    # os.path.join gives the same path on Windows; the hard-coded backslash
    # version produced one oddly named file in the working directory on
    # other platforms.
    sub.to_csv(os.path.join('data', 'result', 'sixth_bnb.csv'))
def train(neg=None, pos=None):
    """Train a BernoulliNB sentiment classifier on TF-IDF features.

    Args:
        neg: path of the negative samples, one per line. Defaults to
            ``../origin/neg.txt`` relative to this module.
        pos: path of the positive samples, one per line. Defaults to
            ``../origin/pos.txt`` relative to this module.

    Lines are tokenised with ``crfseg``, vectorised with TF-IDF and labelled
    0 (negative) or 1 (positive). The fitted classifier is dumped to
    ``data/sgdc_clf.pkl`` next to this module; the vectoriser is not saved.
    The prediction for one sample sentence is printed at the end.
    """
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')

    tagger = crfseg.create_tagger()

    def tok_cn(x):
        # Tokenise with the shared tagger. A named function replaces the
        # Python 2-only ``lambda (x): ...`` spelling.
        return crfseg.cut_zh(x, tagger)

    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
    ])
    clf = BernoulliNB()

    x_train = []
    y_train = []
    for path, label in ((neg, 0), (pos, 1)):
        # Context manager so the handles are closed; they used to leak.
        with codecs.open(path, 'r', 'utf-8') as samples:
            for line in samples:
                x_train.append(line)
                y_train.append(label)

    print('begin transform')
    x_train = pipe.fit_transform(x_train)
    print('begin fit')
    clf.fit(x_train, y_train)
    print('begin save')
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    _ = joblib.dump(clf, clf_file, compress=9)
    print('begin test')
    x_test = [u'这个东西真心很赞']
    x_test = pipe.transform(x_test)
    print(clf.predict(x_test))
def BernoulliNB_pred(X_train, X_test, y_train):
    """Return second-class probabilities from a default BernoulliNB.

    Returns:
        ``(test_probabilities, train_probabilities)`` - note that the test
        set comes first.
    """
    model = BernoulliNB()
    model.fit(X_train, y_train)
    train_proba = model.predict_proba(X_train)
    test_proba = model.predict_proba(X_test)
    return test_proba[:, 1], train_proba[:, 1]
def bernoulli_naive_bayes(x_train, y_train, x_cv, y_cv):
    """Fit a default BernoulliNB, print both accuracies and return the model.

    The accuracy on the training set and on the cross-validation set are
    printed, in that order.
    """
    print('Training with NB...')
    model = BernoulliNB()
    model.fit(x_train, y_train)
    train_accuracy = model.score(x_train, y_train)
    print('Accuracy in training set: %f' % train_accuracy)
    cv_accuracy = model.score(x_cv, y_cv)
    print('Accuracy in cv set: %f' % cv_accuracy)
    return model
def convertToNumeric(df):
    """Add a BernoulliNB-derived numeric column per categorical feature.

    For each object-dtype column from position 2 on, the column is passed to
    ``labelEncode``, a BernoulliNB is fitted on that single column against
    ``df['target']``, and the probability of the second class is stored in a
    new ``<col>_binarized`` column. ``df`` is changed in place; nothing is
    returned.
    """
    for col in df.columns[2:]:
        if df[col].dtype != 'object':
            continue
        print("Converting {0} to numerical data".format(col))
        labelEncode(df, col)
        encoded = df[[col]]
        nb = BernoulliNB()
        nb.fit(encoded, df['target'])
        df[col + "_binarized"] = nb.predict_proba(encoded)[:, 1]
def BernoulliNaiveBayes(x_train, y_train, x_cv, y_cv):
    """Return a default BernoulliNB fitted on the training data.

    ``x_cv`` and ``y_cv`` are accepted but not used.
    """
    model = BernoulliNB()
    model.fit(x_train, y_train)
    return model
def test_BernouliNB():
    """Fit BernoulliNB on six random binary rows and print their predictions."""
    X = np.random.randint(2, size=(6, 100))
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 4, 5])
    print('Y ' + str(Y))

    clf = BernoulliNB()
    clf.fit(X, Y)
    for i, sample in enumerate(X):
        # predict() wants a 2-D (n_samples, n_features) array, so a single
        # sample is reshaped to one row.
        pred_ret = clf.predict(sample.reshape(1, -1))
        print('X[' + str(i) + '] = ' + str(sample)
              + ' pred_ret ' + str(pred_ret))
def evaluate_baseline():
    """Log cross-validated and test accuracy of a BernoulliNB baseline.

    The training and test matrices are built from the ``aclImdb`` folders.
    The mean and standard deviation of a 10-fold cross-validation are logged,
    then the model is fitted on all training data and its test accuracy is
    logged.
    """
    inputs, outputs, words = preprocessing.build_data_target_matrices(
        "aclImdb/train/pos", "aclImdb/train/neg", binary_output=True)
    tst_inputs, tst_outputs, _ = preprocessing.build_test_data_target_matrices(
        "aclImdb/test/pos", "aclImdb/test/neg", words, binary_output=True)

    targets = outputs.ravel()
    model = BernoulliNB()
    scores = cross_val_score(model, inputs, targets, cv=10)
    summary = "Accuracy for %s: %.02f, std: %.02f" % (
        "Baseline BernoulliNB", scores.mean(), scores.std())
    logging.info(summary)

    model.fit(inputs, targets)
    test_accuracy = accuracy_score(tst_outputs.ravel(),
                                   model.predict(tst_inputs))
    logging.info(test_accuracy)
class NaiveBayes(StatModel):
    """``StatModel`` backed by a default scikit-learn BernoulliNB."""

    def __init__(self):
        """Create the untrained classifier and set the model name."""
        self.model = BernoulliNB()
        self.name = "nb"

    def train(self, samples, labels):
        """Fit the classifier on ``samples`` / ``labels``."""
        classifier = self.model
        classifier.fit(samples, labels)

    def predict(self, samples):
        """Return the predicted labels for ``samples``."""
        classifier = self.model
        return classifier.predict(samples)
plt.ylabel('F1 Score') plt.xlabel('Log (' + param_name + ')') plt.title('Plot - Validation Set Performance of ' + classifier_name + ' w.r.t. ' + param_name) plt.show() # Naive bayes classifier print("Bernoulli Naive Bayes Classifier") # Tuning Hyper Parameter alpha hp_f1 = [] a = alpha_from while a < alpha_to: classifier = BernoulliNB(alpha=a) classifier.fit(x_train, y_train) y_pred = classifier.predict(x_valid) score = f1_score(y_valid, y_pred, average=f1_avg_param) hp_f1.append([math.log10(a), score, a]) print("Alpha " + str(a) + " : " + str(score)) a *= alpha_step # select alpha selected_alpha = max(hp_f1, key=lambda item: item[1]) print("Alpha with best performance : " + str(selected_alpha[2])) #plot the graph performance_plot( np.asarray(hp_f1)[:, 0], np.asarray(hp_f1)[:, 1], selected_alpha, "Naive Bayes classifier", "Alpha") #Training the classifier on the selected alpha
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Fit the TF-IDF weighting on the bag-of-words corpus and apply it.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf.obj)

# Latent semantic analysis on top of the TF-IDF vectors.
num_topics = 120
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
lsi.print_topics(10)

import numpy as np

# Project the TF-IDF vectors (the representation the LSI model was trained
# on) rather than the raw counts. Rows are written into a pre-sized dense
# matrix by topic id, so a topic omitted from a document's sparse output
# cannot leave rows of different lengths.
lsi_corpus = np.zeros((len(corpus), num_topics))
for row, lsi_doc in enumerate(lsi[corpus_tfidf]):
    for topic_id, weight in lsi_doc:
        lsi_corpus[row, topic_id] = weight
print(lsi_corpus.shape)

from sklearn.naive_bayes import BernoulliNB

nb_model = BernoulliNB()
nb_model.fit(lsi_corpus, all_categories)

from sklearn.metrics import accuracy_score

# The model is scored on the data it was fitted on, so the message says
# "training data" instead of the former "test data".
print('Accuracy on training data: {}%'.format(
    accuracy_score(all_categories, nb_model.predict(lsi_corpus)) * 100))
def extract(positive_fcs, negative_fcs, features=None):
    '''Takes a labeled set of feature collections (positive and negative)
    and the features wanted, and trains a Naive Bayes classifier on the
    underlying keys of the selected features. If no features are selected,
    all are used.

    Returns two lists of (keyword, strength) tuples ordered by strength.
    The first holds the feature keys with their learned log-probability for
    the positive label, the second the same for the negative label.

    ``*_fcs`` is the list of feature collections, positive label and
    negative label respectively.

    ``features`` designates which specific features get vectorized; the
    other features are ignored. ``None`` means every feature of each
    collection.
    '''
    # Vector of labels
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()
        if not fc:
            logger.warning('how did we get an empty fc? %r', fc)
        else:
            # Honour the documented "all features" default; iterating over
            # ``None`` used to raise TypeError.
            # NOTE(review): assumes a collection exposes its feature names
            # through ``keys()`` - confirm.
            selected = fc.keys() if features is None else features
            for f in selected:
                feat += fc[f]
        # Appended even for an empty collection so that rows stay aligned
        # with ``labels``.
        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifier
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned per-class feature weights. ``[[k]]`` keeps the
    # input 2-D (one row), which inverse_transform expects.
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[[1]])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[[0]])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    # Order by weight, strongest first.
    pos_ordered = sorted(pos_words.items(),
                         key=operator.itemgetter(1), reverse=True)
    neg_ordered = sorted(neg_words.items(),
                         key=operator.itemgetter(1), reverse=True)

    return pos_ordered, neg_ordered
# Cell output: show the training targets.
y_train

# In[16]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# In[17]:
# One estimator per naive Bayes flavour.
myber = BernoulliNB()
mygau = GaussianNB()
mymul = MultinomialNB()

# In[19]:
# Fit all three on the same training split.
mygaumodel = mygau.fit(x_train, y_train)
mybermodel = myber.fit(x_train, y_train)
mymulmodel = mymul.fit(x_train, y_train)

# In[20]:
# Predictions on the test split.
ypgau = mygaumodel.predict(x_test)
ypber = mybermodel.predict(x_test)
ypmul = mymulmodel.predict(x_test)

# In[21]:
from sklearn import metrics

# In[24]:
# NOTE(review): scored against ``y_target``, which is not defined in this
# excerpt - confirm it holds the labels matching ``x_test``.
acc_gau = metrics.accuracy_score(y_target, ypgau)
from __future__ import division import numpy as np from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import BernoulliNB import sys import image_util (train_set, train_label, count_label) = image_util.load_dataset(image_util.DS2_TRAIN_PATH, image_util.DS2_LABEL_SIZE) # clf = GaussianNB() clf = BernoulliNB() clf.fit(train_set, train_label) (val_set, val_label, val_count_label) = image_util.load_dataset(image_util.DS2_VAL_PATH, image_util.DS2_LABEL_SIZE) predictions = clf.predict(val_set) correct_count = 0 for row in range(image_util.DS2_VAL_SIZE): print("prediction: " + str(predictions[row])) print("actual: " + str(val_label[row])) if predictions[row] == val_label[row]: correct_count = correct_count + 1 print(correct_count / image_util.DS2_VAL_SIZE)
def bernoulli_naive_bayes_classifier(train_x, train_y):
    """Return a BernoulliNB (alpha=0.01) fitted on ``train_x`` / ``train_y``."""
    # Import the class that is actually instantiated; the local import used
    # to bring in MultinomialNB, leaving BernoulliNB to be resolved from
    # module scope (NameError when it is not imported there).
    from sklearn.naive_bayes import BernoulliNB

    model = BernoulliNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model
def bernNBClassifier(trainingVectors, targetValues):
    """Return a BernoulliNB fitted with target-derived sample weights.

    The third argument of ``fit`` is ``targetValues * 10000``.
    """
    # NOTE(review): presumably this weights each sample by its own label, so
    # samples labelled 0 would get zero weight - confirm this is intended.
    weights = targetValues * 10000
    clf = BernoulliNB()
    clf.fit(trainingVectors, targetValues, weights)
    return clf
# Linear support vector classifier
print("\n" + "SVC_classifier")
log_model3 = LinearSVC()
log_model3 = log_model3.fit(X=X_train, y=y_train)
y_pred = log_model3.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Multinomial naive Bayes
print("\n" + "MultinomialNB")
log_model_multinomial = MultinomialNB()
log_model_multinomial = log_model_multinomial.fit(X=X_train, y=y_train)
y_pred = log_model_multinomial.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# Bernoulli naive Bayes
print("\n" + "BernoulliNB")
log_model_bernoulli = BernoulliNB()
log_model_bernoulli = log_model_bernoulli.fit(X=X_train, y=y_train)
y_pred = log_model_bernoulli.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)

# Everything except the 'target' column is a feature.
features = tpot_data.drop('target', axis=1).values
targets = tpot_data['target'].values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, targets, random_state=42)

# Score on the training set was:0.9051851851851852
exported_pipeline = BernoulliNB(alpha=0.001, fit_prior=True)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Build the word-count matrix
data = pd.read_excel('C:/Users/64191/Desktop/Contents.xlsx', sheet_name=0)
# Strip digits and ASCII letters. The character class used to read
# ``a-zA-A``, which left the upper-case letters B-Z in place; ``regex=True``
# makes the pattern interpretation explicit.
data.Content = data.Content.str.replace('[0-9a-zA-Z]', '', regex=True)
jieba.load_userdict(r'C:/Users/64191/Desktop/all_words.txt')
with open(r'C:/Users/64191/Desktop/mystopwords.txt', encoding='UTF-8') as f:
    stop_words = [i.strip('\n') for i in f.readlines()]
# Set copy for constant-time membership tests inside cut().
stop_word_set = set(stop_words)


def cut(x):
    """Segment ``x`` with jieba, drop stop words, and join with spaces."""
    return ' '.join(i for i in jieba.lcut(x) if i not in stop_word_set)


word = data.Content.apply(cut)
counts = CountVectorizer(min_df=0.01)
data_matrix = counts.fit_transform(word).toarray()

# Classify and evaluate
if hasattr(counts, 'get_feature_names_out'):
    # Newer scikit-learn releases dropped get_feature_names().
    feature_names = counts.get_feature_names_out()
else:
    feature_names = counts.get_feature_names()
X = pd.DataFrame(data_matrix, columns=feature_names)
Y = data.Type
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=1)
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
pred = bnb.predict(X_test)
print(classification_report(Y_test, pred))
# Load the whitespace-separated data sets; column 3 holds the label.
trainData = pd.read_table('../dataset1/train.txt', header=None,
                          encoding='gb2312', delim_whitespace=True)
testData = pd.read_table('../dataset1/test.txt', header=None,
                         encoding='gb2312', delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

# Hand-written Bayes classifier: train, predict, score, and time it.
time_start1 = time.time()
clf1 = BayesClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Bayes: %f" % score1)
print("Runtime of self-Bayes:", time_end1 - time_start1)

# scikit-learn BernoulliNB on the same data, timed the same way.
time_start = time.time()
clf = BernoulliNB()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-Bayes: %f" % score)
print("Runtime of sklearn-Bayes:", time_end - time_start)
clf_sum = 0 lr_sum = 0 svm_li_sum = 0 svm_rbf_sum = 0 i = -1 for train, test in kf: #NBC i += 1 train_1 = train_disc_data[train] train_2 = train_conti_data[train] test_1 = train_disc_data[test] test_2 = train_conti_data[test] train_true = train_target[train] test_true = train_target[test] clf_train_disc = BernoulliNB() clf_train_disc.fit(train_1, train_true) clf_train_conti = GaussianNB() clf_train_conti.fit(train_2, train_true) result1 = clf_train_disc.predict_proba(test_1) result2 = clf_train_conti.predict_proba(test_2) result_arr = np.zeros(len(test), dtype=int) for index in range(len(test)): result_a = result1[index, 0] * result2[index, 0] result_b = result1[index, 1] * result2[index, 1] if (result_a < result_b): result_arr[index] = 1 else: result_arr[index] = 0 clf_sum += f1_score(test_true, result_arr) if (k_value == 50): t_test_value[i, 0] = f1_score(test_true, result_arr) #logistic lr_data = np.column_stack((train_conti_data, train_disc_data))
# NOTE(review): features and labels are cleaned independently; presumably
# they stay row-aligned only when NaNs occur in the same rows - confirm.
testdf = testdf.dropna()
testLabel = testLabel.dropna()
testLabel = testLabel.apply(int)

# Fit each model on the training data and print its test accuracy. Any
# failure is reported and ends the comparison.
try:
    svmModel = svmAlg.fit(trainDf, trainLabel)
    svmpred = svmModel.predict(testdf)
    svmAcc = accuracy_score(testLabel, svmpred)
    print('SVM Accuracy : ' + str(svmAcc))

    gnbModel = gnb.fit(trainDf, trainLabel)
    gnbpred = gnbModel.predict(testdf)
    gnbAcc = accuracy_score(testLabel, gnbpred)
    print('GNB Accuracy : ' + str(gnbAcc))

    bnbModel = bnb.fit(trainDf, trainLabel)
    bnbpred = bnbModel.predict(testdf)
    bnbAcc = accuracy_score(testLabel, bnbpred)
    print('BNB Accuracy : ' + str(bnbAcc))

    treeModel = tree.fit(trainDf, trainLabel)
    treepred = treeModel.predict(testdf)
    treeAcc = accuracy_score(testLabel, treepred)
    print('Decision Tree Accuracy : ' + str(treeAcc))

    rndModel = rnd.fit(trainDf, trainLabel)
    rndpred = rndModel.predict(testdf)
    rndAcc = accuracy_score(testLabel, rndpred)
    print('Random Forest Accuracy : ' + str(rndAcc))
except Exception as e:
    print('model error ' + str(e))
#############################SUPPORT VECTOR MACHINES########################### from sklearn.svm import SVC svc = SVC(verbose=True, random_state=0) svc.fit(X_train, y_train) ############################### Naive Bayes ################################## from sklearn.naive_bayes import BernoulliNB BernNB = BernoulliNB(binarize=True) BernNB.fit(X_train, y_train) #### 3 SINGLE LAYER NEURAL NETWORL - PERCEPTRON ############################### X_ann = X.copy() X_train_ann = np.array(X_train.copy()) X_test_ann = np.array(X_test.copy()) y_train_ann = np.array(y_train.copy()) class NeuralNetwork(): def __init__(self): np.random.seed(4) self.synaptic_weights = 2 * np.random.random((12, 1)) - 1 def sigmoid(self, x):
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Train KNeighborsClassifier Model (n_jobs=-1: use all available cores)
KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier.fit(X_train, Y_train)
# Train LogisticRegression Model
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier.fit(X_train, Y_train)
# Train Bernoulli Naive Bayes Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, Y_train)
# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier.fit(X_train, Y_train)
# Evaluate Models
from sklearn import metrics
# (display name, fitted estimator) pairs, in the order they are evaluated.
models = []
models.append(('Naive Baye Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))
# i is the display name and v the estimator; the loop body is not part of
# this excerpt.
for i, v in models:
# Hold out the last 25% of the samples (no shuffling) for testing.
X_train, X_test, y_train, y_test = train_test_split(
    text_data, Y, test_size=0.25, shuffle=False)
count = CountVectorizer(preprocessor=myPreprocessor, lowercase=False,
                        tokenizer=myTokenizer, max_features=size)
# Learn the vocabulary from the training text only, then reuse it for the
# test text so both matrices share the same columns.
X_train = count.fit_transform(X_train).toarray()
print("----------Train vector------------", len(X_train))
print(X_train)
X_test = count.transform(X_test).toarray()
print("----------Test vector------------", len(X_test))
print(X_test)

# Time the fit on its own.
start_time = time.time()
clf = BernoulliNB()
model = clf.fit(X_train, y_train)
training_time = (time.time() - start_time)

# Restart the clock so that testtime covers prediction only; measuring from
# start_time would fold training_time into the reported test time.
test_start_time = time.time()
y_pred = model.predict(X_test)
testtime = time.time() - test_start_time
test_report = classification_report(y_test, y_pred, output_dict=True)
''' 模型搭建 ''' # 只取星期几和街区作为分类器输入特征 features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'] # 分割训练集(3/5)和测试集(2/5) training, validation = train_test_split(trainData, train_size=.60) # 朴素贝叶斯建模,计算log_loss model = BernoulliNB() nbStart = time.time() model.fit(training[features], training['crime']) nbCostTime = time.time() - nbStart predicted = np.array(model.predict_proba(validation[features])) print("朴素贝叶斯建模耗时 %f 秒" % (nbCostTime)) # 朴素贝叶斯建模耗时 0.591072 秒 print("朴素贝叶斯log损失为 %f" % (log_loss(validation['crime'], predicted))) # 朴素贝叶斯log损失为 2.615596 # 逻辑回归建模,计算log_loss model = LogisticRegression(C=.01) lrStart = time.time() model.fit(training[features], training['crime']) lrCostTime = time.time() - lrStart predicted = np.array(model.predict_proba(validation[features])) log_loss(validation['crime'], predicted) print("逻辑回归建模耗时 %f 秒" % (lrCostTime))
# Finish and show the SVM ROC figure started earlier in the script.
plt.title('ROC curve for SVM Fraud Classification')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()
#END SVM MODEL

#START NAIVE BAYES MODEL
# Create the train and test populations (33% in the testing data set) for
# Naive Bayes and Decision Tree.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size = .33, random_state = 17)

#NB1 BernoulliNB
BernNB = BernoulliNB(binarize = 0.025) # use either 0.025 0.1 or True
BernNB.fit(X1_train, Y1_train)
print(BernNB)
Y1_expect = Y1_test
Y1_pred = BernNB.predict(X1_test)
print(accuracy_score(Y1_expect, Y1_pred))

#BernNB Evaluation
#Confusion Matrix (only displayed when run interactively; not stored)
confusion_matrix(Y1_expect, Y1_pred)
#AUCROC Curve
# roc_curve needs a continuous score to sweep thresholds over; hard 0/1
# predictions collapse the curve to a single operating point. Use the
# predicted probability of the second class (BernNB.classes_[1]).
# NOTE(review): assumes a binary target whose positive label sorts last
# (e.g. 0/1) -- confirm against Y1.
Y1_score = BernNB.predict_proba(X1_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y1_expect, Y1_score)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Bernoulli Naive Bayes Fraud Classification')
# Setup data structures to hold train, test results train_jll = np.zeros((10, 15)) test_jll = np.zeros((10, 15)) for i in range(0, 10): idx = 0 # Split datasets x_train, x_test, y_train, y_test = train_test_split(Xs[i], ys[i], test_size=1. / 3, random_state=7000) for j in alphas: # 1. Create new Bernoulli Naive Bayes model using alpha value mod = BernoulliNB(alpha=j) # Fit the model to the training set mod.fit(x_train, y_train) # Compute the joint log likelihood for the training set, store it train_jll 2d array total_res = mod._joint_log_likelihood(x_train) y_train_binary = y_train * 1 entry_val = 0 # Sum-up by matching true labels for k in range(0, len(y_train)): entry_val += total_res[k][y_train_binary[k]] # Store result train_jll[i][idx] = entry_val # 2. Compute the joint log likelihood for the testing set, store it test_jll 2d array total_res = mod._joint_log_likelihood(x_test) y_test_binary = y_test * 1 entry_val = 0 # Sum-up by matching true labels for k in range(0, len(y_test)):
# Replace the "Sex" column with the encoder's integer codes.
le.fit(dataset["Sex"])
dataset["Sex"] = le.transform(dataset["Sex"])

# Dependent variable (y) is the passenger class; the independent variables
# (X) are the remaining selected columns.
y = dataset["Pclass"]
predictor_columns = ["Survived", "Sex", "Age", "SibSp", "Parch", "Fare"]
X = dataset[predictor_columns]
print(y.count())

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Bernoulli naive Bayes: fit on the training rows, predict the held-out rows.
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Fraction of correctly classified held-out rows.
accuracy_value = accuracy_score(y_test, y_pred, normalize=True)
print("The accuracy score is : ", accuracy_value)

# Per-class breakdown of the predictions.
confusion_values = confusion_matrix(y_test, y_pred)
print("The confusion matrix is: \n", confusion_values)
# Four classifiers compared on the same toy data set.
classifier = SVC()
classifier2 = DecisionTreeClassifier()
classifier3 = BernoulliNB()
classifier4 = GaussianNB()

# Feature columns: shape,size,color
# spherical,oval,long
# small,medium,large
train_x = [
    [0, 1, 0], [0, 2, 1], [1, 2, 2], [0, 1, 2], [2, 1, 2], [0, 0, 1],
    [0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 1, 0], [0, 1, 0], [0, 2, 1],
]
train_y = [0, 1, 2, 3, 4, 5, 5, 5, 5, 0, 0, 1]
test_x = [[1, 2, 2], [0, 1, 0]]
# test_y = [2,5]

# Fit every model first, then predict with every model, in declaration order.
all_classifiers = (classifier, classifier2, classifier3, classifier4)
for estimator in all_classifiers:
    estimator.fit(train_x, train_y)
prediction, prediction2, prediction3, prediction4 = [
    estimator.predict(test_x) for estimator in all_classifiers
]

# Report each model's predictions for the two test samples.
labelled_predictions = (
    ("prediction :", prediction),
    ("prediction2 :", prediction2),
    ("prediction3 :", prediction3),
    ("prediction4 :", prediction4),
)
for caption, predicted in labelled_predictions:
    print(caption, predicted)
def _load_or_build(selector_file, matrix_file, build):
    """Return ``(selector, matrix)`` from the pickle cache, building on a miss.

    The cache is used only when BOTH files exist. Otherwise ``build()`` is
    called, and its ``(selector, matrix)`` result is pickled to the two files
    and returned.
    """
    # NOTE: unpickling runs arbitrary code; these files must only ever be the
    # ones this script wrote itself.
    if path.exists(selector_file) and path.exists(matrix_file):
        with open(selector_file, "rb") as fh:
            selector = pk.load(fh)
        with open(matrix_file, "rb") as fh:
            matrix = pk.load(fh)
        return selector, matrix

    selector, matrix = build()
    with open(selector_file, "wb") as fh:
        pk.dump(selector, fh)
    with open(matrix_file, "wb") as fh:
        pk.dump(matrix, fh)
    return selector, matrix


def main():
    """Train NB / decision-tree variants, print their F1 scores, write predictions.

    Steps: load the train and test data, build (or load from ``./pickles``)
    the TSVD / RFE / RFECV / chi-square reductions, oversample with SMOTE,
    fit Bernoulli naive Bayes and decision-tree models on each variant, print
    their macro F1 on the training data, and write the chi-square NB model's
    predictions for the test file to ``test_file_prediction.dat``.

    Fixes relative to the earlier version:
    * the chi-square cache tested the wrong path, opened ``/pickles/...``
      and used a different pickle alias than the rest of the function, so it
      could never load; it now goes through the same cache helper as the
      other reductions;
    * pickle files are closed deterministically;
    * the ``tsvd`` / ``tvsd_resampled_f1`` scores were printed under each
      other's label;
    * ``dt_chi_f1`` was computed but never printed;
    * the SMOTE pass over the RFE matrix, whose result was never used, and
      the unused RFE transform of the test data are gone.
    """
    #start timer
    start = time.time()

    #import data from training file and test file
    sparse_matrix, ranks = importData("train_drugs.dat", True, 0)
    test_data = importData("test.dat", False, sparse_matrix.shape[1])

    #run dimensionality reduction on input data
    def build_tsvd():
        svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)
        selector = svd.fit(sparse_matrix, ranks)
        return selector, selector.transform(sparse_matrix)

    selector_tsvd, sparse_matrix_tsvd = _load_or_build(
        "./pickles/selector_tsvd.p", "./pickles/sparse_matrix_tsvd.p",
        build_tsvd)

    #recursive feature selection to remove most of the least important features
    selector_rfe, sparse_matrix_rfe = _load_or_build(
        "./pickles/selector_rfe.p", "./pickles/sparse_matrix_rfe.p",
        lambda: rfeFeatureSelection(sparse_matrix, ranks))

    #recursive feature selection with cross validation on the remaining features
    selector_rfecv, sparse_matrix_rfecv = _load_or_build(
        "./pickles/selector_rfecv.p", "./pickles/sparse_matrix_rfecv.p",
        lambda: rfecvFeatureSelection(sparse_matrix_rfe, ranks))

    #chi^2 selection on the original data
    selector_chi, sparse_matrix_chi = _load_or_build(
        "./pickles/selector_chi.p", "./pickles/sparse_matrix_chi.p",
        lambda: chiSquareSelection(sparse_matrix, ranks))

    #account for imbalanced data with SMOTE over sampling
    # .toarray() yields a plain ndarray; .todense() yields np.matrix, which
    # recent scikit-learn releases reject.
    Orig_X_resampled, Orig_y_resampled = SMOTE().fit_resample(
        sparse_matrix.toarray(), ranks)
    TSVD_X_resampled, TSVD_y_resampled = SMOTE().fit_resample(
        sparse_matrix_tsvd, ranks)
    rfecv_X_resampled, rfecv_y_resampled = SMOTE().fit_resample(
        sparse_matrix_rfecv, ranks)
    chi_X_resampled, chi_y_resampled = SMOTE().fit_resample(
        sparse_matrix_chi, ranks)

    # Training data pushed through each fitted selector; computed once and
    # shared by every model that is evaluated on that representation.
    tsvd_features = selector_tsvd.transform(sparse_matrix)
    rfe_features = selector_rfe.transform(sparse_matrix)
    rfecv_features = selector_rfecv.transform(rfe_features)
    chi_features = selector_chi.transform(sparse_matrix)

    def fit_nb(features, labels):
        return BernoulliNB().fit(features, labels)

    def fit_tree(features, labels):
        return DecisionTreeClassifier(random_state=0).fit(features, labels)

    def macro_f1(model, features):
        return f1_score(ranks, model.predict(features), average='macro')

    def report(entries):
        # One output line of alternating label / score, as before.
        flat = []
        for label, model, features in entries:
            flat.extend((label, macro_f1(model, features)))
        print(*flat)

    #set up classifiers, train on data
    #Bernoulli naive bayes
    nb_chi = fit_nb(chi_X_resampled, chi_y_resampled)
    nb_entries = [
        ('orig:', fit_nb(sparse_matrix, ranks), sparse_matrix),
        ('orig_resampled:',
         fit_nb(Orig_X_resampled, Orig_y_resampled), sparse_matrix),
        ('tsvd:', fit_nb(sparse_matrix_tsvd, ranks), tsvd_features),
        ('tvsd_resampled_f1:',
         fit_nb(TSVD_X_resampled, TSVD_y_resampled), tsvd_features),
        ('rfe_f1:', fit_nb(rfe_features, ranks), rfe_features),
        ('rfecv_f1:', fit_nb(sparse_matrix_rfecv, ranks), rfecv_features),
        ('rfecv_reasmple_f1_non_sampled:',
         fit_nb(rfecv_X_resampled, rfecv_y_resampled), rfecv_features),
        ('chi_f1:', nb_chi, chi_features),
    ]

    #decision tree classifier
    dt_entries = [
        ('dt_rfec_resampled_f1:',
         fit_tree(rfecv_X_resampled, rfecv_y_resampled), rfecv_features),
        ('dt_rfecv_f1:', fit_tree(sparse_matrix_rfecv, ranks), rfecv_features),
        ('dt_orig_f1:', fit_tree(sparse_matrix, ranks), sparse_matrix),
        ('dt_orig_resampled_f1:',
         fit_tree(Orig_X_resampled, Orig_y_resampled), sparse_matrix),
        ('dt_tsvd_f1:', fit_tree(sparse_matrix_tsvd, ranks), tsvd_features),
        ('dt_tsvd_resampled_f1:',
         fit_tree(TSVD_X_resampled, TSVD_y_resampled), tsvd_features),
        ('dt_chi_f1:',
         fit_tree(chi_X_resampled, chi_y_resampled), chi_features),
    ]

    #output the different test results (macro F1 on the training data)
    report(nb_entries)
    report(dt_entries)

    #test with testfile using best classifier
    test_predict = nb_chi.predict(selector_chi.transform(test_data))
    with open('test_file_prediction.dat', "w") as fp2:
        fp2.writelines(str(num) + '\n' for num in test_predict)
    print(len(test_predict))

    #print out time elapsed
    end = time.time()
    print(end - start)
# Persist the stacked features of the preceding (ridge) stage.
df_stack.to_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv',
                index=None, encoding='utf8')
print('ridge特征已保存\n')

########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
# Out-of-fold class probabilities for the training rows, and accumulated
# (later averaged) class probabilities for the test rows.
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0
fold_splits = StratifiedKFold(score, n_folds=n_folds, random_state=1017)
for i, (tr, va) in enumerate(fold_splits):
    print('stack:%d/%d' % ((i + 1), n_folds))
    bnb = BernoulliNB()
    bnb.fit(train_feature[tr], score[tr])
    score_va = bnb.predict_proba(train_feature[va])
    score_te = bnb.predict_proba(test_feature)
    print(score_va)
    # Squared error of the hard predictions on the held-out fold.
    fold_error = mean_squared_error(score[va], bnb.predict(train_feature[va]))
    print('得分' + str(fold_error))
    stack_train[va] += score_va
    stack_test += score_te
# Average the test-set probabilities over the folds.
stack_test /= n_folds

# Training rows first, test rows after; one output column per stacked
# feature, rounded to 6 decimals.
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i, stacked_column in enumerate(stack.T):
    df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stacked_column, 6)
df_stack.to_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv',
                index=None, encoding='utf8')
# Classifier fitted on the output of prepare_combined_model (below).
clf_prob = LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=0.2, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# Fit the text and description classifiers on their own feature matrices.
clf_text.fit(trn_text_bow, trn_text_classes_bow)
clf_description.fit(trn_description_bow, trn_description_classes_bow)
# presumably features derived from the two fitted classifiers' outputs on
# test_set -- TODO confirm against prepare_combined_model
trn_prob, trn_prob_class = prepare_combined_model(clf_text, clf_description, vectorizer_text, vectorizer_description, test_set)
clf_prob.fit(trn_prob, trn_prob_class)
# ###################### Add genders for each user #######################
print 'Predicting...'
# user: user id -> list (at most 5 entries are considered per user below)
user = defaultdict(list)
image_url = {}
for id in database:
    document = database[str(id)]
    # Only documents without a gender, for users with fewer than 5 entries so far.
    if document['gender'] == None and len(user[document['user']['id']]) < 5:
        # The rest of this condition lies beyond the end of this excerpt.
        if document['user']['profile_image_url'] != None and document['user'][
# NOTE: the enclosing function definition starts before this excerpt; the
# indented lines below are the tail of its body (presumably clean_text,
# which is applied to df.msg further down -- TODO confirm).
    # Stem every remaining word and join the stems back into one string.
    after_stem_words = []
    for w in new_words:
        after_stem_words.append(ps.stem(w))
    clean_msg = ' '.join(after_stem_words)
    return clean_msg


# Clean every message in place.
df['msg'] = df.msg.apply(clean_text)
print('data cleaned...')
# Vectorize the cleaned messages into a dense array, then reduce with pca.
X = cv.fit_transform(df.msg).toarray()
new_X = pca.fit_transform(X)
# Labels are taken from the first column of the frame.
y = df.iloc[:, 0].values
print('going for training...')
log.fit(new_X, y)
print('model trained....')
# Build the Tk window: maximized, yellow background, title label on top.
root = Tk()
root.state('zoomed')
root.configure(background='yellow')
l1 = Label(root, text='Spam Detection', bg='yellow', fg='blue', font=('', 40, 'bold'))
l1.place(x=190, y=20)
l2 = Label(root, text='Enter msg:', bg='yellow',
# Vectorize the new test data with the already-fitted vectorizer.
test_numbers = cv.transform(new_test_data).toarray()
print(test_numbers)

# # Multinomial Naive Bayes :

# In[10]:

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# In[11]:

mnb = MultinomialNB()
mnb.fit(numbers, y)

# In[12]:

# Bare expression: the result is only shown when run as a notebook cell.
mnb.predict(test_numbers)

# # Bernoulli Naive Bayes :

# In[13]:

bnb = BernoulliNB()
bnb.fit(numbers, y)

# In[14]:

# Bare expression: the result is only shown when run as a notebook cell.
bnb.predict(test_numbers)

# In[ ]:
def bernoulliNB():
    """Fit BernoulliNB on random binary data and print one prediction.

    Draws a 6x100 matrix of random 0/1 features, fits the model against six
    fixed labels, and prints the predicted label for the third sample.
    """
    samples = np.random.randint(2, size=(6, 100))
    labels = np.array([1, 2, 3, 4, 4, 5])

    model = BernoulliNB()
    model.fit(samples, labels)

    third_sample = samples[2:3]  # slice keeps the 2-D shape predict() expects
    print(model.predict(third_sample))
class NBClassifier(object):
    """Bernoulli naive Bayes classifier over binarized graph features.

    Graphs are vectorized with an encoder built from ``decompose_func``,
    every positive stored entry is set to 1, and the resulting matrix is
    given to a ``BernoulliNB`` model (``alpha=0.1``, no further
    binarization inside the model).
    """

    def __init__(self, decompose_func=None, preprocessor=None, nbits=15, seed=1):
        """Build the feature encoder and the underlying BernoulliNB model."""
        self.decompose_func = decompose_func
        self.nbits = nbits
        feature_size, bitmask = set_feature_size(nbits=nbits)
        self.feature_size = feature_size
        self.bitmask = bitmask
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        # binarize=None: the data is binarized in _binary_features instead.
        self.classifier = BernoulliNB(alpha=0.1,
                                      binarize=None,
                                      fit_prior=True,
                                      class_prior=None)

    def _binary_features(self, graphs):
        """Vectorize ``graphs`` and set every positive stored entry to 1."""
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        return data_mtx

    def fit(self, graphs, targets):
        """Fit the model on ``graphs`` with labels ``targets``; return self."""
        self.classifier.fit(self._binary_features(graphs), targets)
        return self

    def decision_function(self, graphs):
        """Return, per graph, the predicted probability in column 1.

        Assumes binary classification with column 1 representing positives.
        """
        preds = self.classifier.predict_proba(self._binary_features(graphs))
        return preds[:, 1].reshape(-1)

    def predict(self, graphs):
        """Return the predicted class label for every graph."""
        return self.classifier.predict(self._binary_features(graphs))

    def explain(self, graphs, top_k):
        """Rank signatures of the ``top_k`` most positive-leaning features.

        Returns a list of ``(share, weight, signature)`` tuples sorted by
        decreasing weight, where ``weight`` is the number of top-k features
        with that signature divided by the log of the signature's total
        count, and ``share`` is ``weight`` divided by the sum of all weights.
        """
        feature_dict, feature_counts = get_feature_dict(
            graphs,
            decomposition_funcs=self.decompose_func,
            nbits=self.nbits,
            return_counts=True)

        # Log-odds per feature: log P(f | class 1) - log P(f | class 0).
        # Dividing the two log-probabilities (both negative) would rank the
        # features that favour class 0 first.
        log_prob = self.classifier.feature_log_prob_
        scores = log_prob[1, :] - log_prob[0, :]
        ranked_pos_features = np.argsort(-scores)

        # Aggregate feature counts by signature, then take logs.
        sig_counts = {}
        for fid in feature_dict:
            sig = feature_dict[fid].graph['signature']
            sig_counts[sig] = sig_counts.get(sig, 0) + feature_counts[fid]
        # NOTE(review): a signature whose total count is exactly 1 gets
        # log(1) == 0 and makes the division below raise ZeroDivisionError;
        # the intended normalization needs to be confirmed before changing it.
        sig_dict = {sig: math.log(total) for sig, total in sig_counts.items()}

        # select top_k
        feature_graphs = [
            feature_dict[fid] for fid in ranked_pos_features[:top_k]
        ]
        c = Counter(g.graph['signature'] for g in feature_graphs)
        cnt = {sig: c[sig] / sig_dict[sig] for sig in c}
        tot = sum(cnt.values())
        res = [(cnt[sig] / tot, cnt[sig], sig)
               for sig in sorted(cnt, key=cnt.get, reverse=True)]
        return res
# (Continuation of a call that begins before this excerpt.)
# NOTE(review): test_size=0.000001 leaves a tiny test split, so the accuracies
# printed below rest on very few samples -- confirm this is intended.
test_size=0.000001, random_state=0)
# Support vector classifier: fit on the training split, score on the test split.
print('Training SVC: ')
clf = svm.SVC()
clf.fit(X_train, y_train)
print("SVC Accuracy Test: ", clf.score(X_test, y_test))
########################
# Gaussian naive Bayes, plus the class probabilities of the first test sample.
print('Training GNB: ')
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print("GNB Accuracy Test: ", gnb.score(X_test, y_test))
print(gnb.predict_proba(X_test[0].reshape(1, -1)))
########################
# Bernoulli naive Bayes.
print('Training BNB: ')
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
print("BNB Accuracy Test: ", bnb.score(X_test, y_test))
#######################
# Disabled kNN sweep, kept as an inert string literal.
''' print('Training kNN: ') test_result=list() for jj in range (1,200,20): neigh = KNeighborsClassifier(n_neighbors=jj,p=1) neigh.fit(X_train,y_train) print("kNN Accuracy Test for",jj," neighbors: ",neigh.score(X_test,y_test)) test_result.append(neigh.score(X_test,y_test)) plt.plot(test_result) plt.ylabel('some numbers') plt.show()'''
'''
# NOTE: this excerpt starts inside an `if` branch whose header is not
# visible; the indented lines below are that branch's body.
    randForrC.fit(trainX, yTrain)
    tmpSCR = randForrC.score(testX, yTest)
    scores['randForr'][label].append(tmpSCR)
else:
    randForrR.fit(trainX, yTrain)
    tmpSCR = randForrR.score(testX, yTest)
    scores['randForr'][label].append(tmpSCR)
# print("start adaBoost")
# adaBoostC.fit(trainX, yTrain)
# tmpSCR = adaBoostC.score(testX, yTest)
# scores['adaBoost'][label].append(tmpSCR)
print("start bernoulli NB")
if cnt < 2:
    bernNB.fit(trainX, yTrain)
    tmpSCR = bernNB.score(testX, yTest)
    scores['bernNB'][label].append(tmpSCR)
else:
    # NOTE(review): a different estimator (gausRidge) is used here, but its
    # score is still stored under the 'bernNB' key -- confirm that is intended.
    gausRidge.fit(trainX, yTrain)
    tmpSCR = gausRidge.score(testX, yTest)
    scores['bernNB'][label].append(tmpSCR)
# print("start gradient boost")
# gradBoostC.fit(trainX, yTrain)
# tmpSCR = gradBoostC.score(trainX, yTest)
# scores['gradBoost'][label].append(tmpSCR)
print("start SVM")
if cnt < 2:
    svmC.fit(trainX, yTrain)