def test_BernouliNB2():
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
    ])
    print 'X ' + str(X)
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print 'Y ' + str(Y)

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1002, 1010],
        [1010, 910],
        [1003, 980],
        [1008, 1030],
        [-1, -1],
        [-3, -10],
        [40, 1],
        [1, -100],
    ])
    for i in xrange(len(X2)):
        #pred_ret = clf.predict_proba(X2[i])
        pred_ret = clf.predict(X2[i])
        # print the test sample X2[i] (the original printed X[i] by mistake)
        print 'X2[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret)
def train_model(data, target):
    """
    Split the data into a training set and a test set,
    instantiate a Bernoulli Naive Bayes classifier, train it on the
    training set, and then evaluate the model on the test set.
    """
    # Using cross-validation
    # TO TRY: stratification for dividing preclassified tweets into homogeneous subgroups before
    # sampling, in order to improve the representativeness of the sampling (see the sketch below)
    train_tweets, validation_tweets, train_sentiment, validation_sentiment = \
        cross_validation.train_test_split(data, target, test_size=0.4)

    # Fit the Naive Bayes classifier with the training tweets and corresponding sentiment
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)
    predicted = classifier.predict(validation_tweets)

    # Using the cross-validation split, evaluate the accuracy of the predicted tweets
    evaluate_model(validation_sentiment, predicted)

    # Pickle the classifier
    pickle_file = open('nb_classifier.pickle', 'wb')
    pickle.dump(classifier, pickle_file)
    pickle_file.close()

    return classifier
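# The "TO TRY" note above can be prototyped with the stratify argument of
# train_test_split. A minimal sketch, assuming the newer sklearn
# model_selection API (the snippet above uses the older cross_validation
# module); the function name is illustrative, not part of the original code.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

def train_model_stratified(data, target):
    # stratify=target keeps the sentiment class proportions identical
    # in the training and validation splits
    train_tweets, validation_tweets, train_sentiment, validation_sentiment = \
        train_test_split(data, target, test_size=0.4, stratify=target)
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)
    predicted = classifier.predict(validation_tweets)
    return classifier, predicted, validation_sentiment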
def train(cutoffs):
    print "\n========== Start Training =========="

    if len(__TRAIN_DATA) == 3:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1], __TRAIN_DATA[2])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])

    clf = BernoulliNB(fit_prior=True)

    for i in range(len(list_io_addr)):
        path_in = list_io_addr[i]
        print "\nGenerating training set from {}".format(path_in)

        # the sparse matrix is stored in binary form, so open the file in binary mode
        with open(path_in, "rb") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            print "Discarding selected features......"
            X = discard_vars(X, cutoffs)

        vector_len = len(X[0])
        X_train = X[:, 0:vector_len-1]
        y_train = X[:, vector_len-1]
        print "Done"

        # sm = SMOTE(ratio=0.9)
        # X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

        print "Fitting Model......"
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print "Done"

    # pickle needs a binary file handle as well
    with open(__ROOT_MODEL, "wb") as file_out:
        pickle.dump(clf, file_out)
def test_discretenb_predict_proba(): """Test discrete NB classes' probability scores""" # The 100s below distinguish Bernoulli from multinomial. X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]] X_multinomial = [[0, 1], [1, 3], [4, 0]] # Confirm that the 100s above distinguish Bernoulli from multinomial y = [0, 0, 1] cls_b = BernoulliNB().fit(X_bernoulli, y) cls_m = MultinomialNB().fit(X_bernoulli, y) assert_not_equal(cls_b.predict(X_bernoulli)[-1], cls_m.predict(X_bernoulli)[-1]) # test binary case (1-d output) y = [0, 0, 2] # 2 is regression test for binary case, 02e673 for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict(X[-1]), 2) assert_equal(clf.predict_proba(X[0]).shape, (1, 2)) assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1), np.array([1., 1.]), 6) # test multiclass case (2-d output, must sum to one) y = [0, 1, 2] for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict_proba(X[0]).shape, (1, 3)) assert_equal(clf.predict_proba(X[:2]).shape, (2, 3)) assert_almost_equal(np.sum(clf.predict_proba(X[1])), 1) assert_almost_equal(np.sum(clf.predict_proba(X[-1])), 1) assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1) assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """
    Generates features and fits classifier.
    Input command line argument is optional run name, defaults to date/time.
    """
    logging.info("Loading features...")
    if not in_pkl:
        return "input .pkl required"
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")

    # Bernoulli Naive Bayes
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures, trainTargets)

    logging.info("Predicting...")
    # Use probabilities instead of binary class prediction in order to generate a ranking
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s" % output_file)
    f = open(output_file, "w")
    f.write("id\n")
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
        # only writes item_id per output spec, but may want to look at predicted_scores
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
def tryBinomialNaiveBayes(goFast):
    best_score = 0
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm")
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")

    from sklearn.naive_bayes import BernoulliNB
    for alpha_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
        for binarize_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
            for fit_prior_value in [True, False]:
                binary_operator = BernoulliNB(alpha=alpha_value, binarize=binarize_value, fit_prior=fit_prior_value)
                binary_operator.fit(training_data, training_labels)
                current_score = binary_operator.score(validation_data, validation_labels)
                print "Current test: " + str(alpha_value), str(binarize_value), fit_prior_value
                print "Current score: " + str(current_score)
                if current_score > best_score:
                    best_score = current_score
                    print "***NEW MAXIMUM SCORE: " + str(best_score)
                    print "***NEW MAXIMUM PARAMETERS: " + str(alpha_value), str(binarize_value), fit_prior_value
    print "Best score was " + str(best_score)
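# The manual triple loop above can also be expressed as a grid search, which
# scores every alpha/binarize/fit_prior combination by cross-validation rather
# than against the single validation file. A minimal sketch assuming the
# training_data/training_labels arrays loaded above; cv=3 is an assumption.
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

param_grid = {
    "alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "binarize": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "fit_prior": [True, False],
}
search = GridSearchCV(BernoulliNB(), param_grid, cv=3)
search.fit(training_data, training_labels)
print("Best parameters: " + str(search.best_params_))
print("Best CV score: " + str(search.best_score_))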
def compareClassifiers(): (observations, classes) = createObservations() observations = np.array(observations) classes = np.array(classes) # make tree classifier my_tree = tree.DecisionTreeClassifier() my_tree.fit(observations, classes) tree_score = my_tree.score(observations, classes) tree_cv = cross_validation.cross_val_score(my_tree, observations, classes, scoring='accuracy', cv=10) #print "tree score:", tree_score, "tree cv", np.mean(tree_cv) # make naive classifier naive = BernoulliNB(binarize=None) naive.fit(observations, classes) naive_score = naive.score(observations, classes) naive_cv = cross_validation.cross_val_score(naive, observations, classes, scoring='accuracy', cv=10) #print "naive score:", naive_score, "naive cv", np.mean(naive_cv) # make SVM classifier svm = LinearSVC() svm.fit(observations, classes) svm_score = svm.score(observations, classes) svm_cv = cross_validation.cross_val_score(svm, observations, classes, scoring='accuracy', cv=10) #print "svm score:", svm_score, "svm cv", np.mean(svm_cv) # make Log classifier log = LogisticRegression() log.fit(observations, classes) log_score = log.score(observations, classes) log_cv = cross_validation.cross_val_score(log, observations, classes, scoring='accuracy', cv=10) #print "log score:", log_score, "log cv", np.mean(log_cv) return [(tree_score, np.mean(tree_cv)), (naive_score, np.mean(naive_cv)), (svm_score, np.mean(svm_cv)), (log_score, np.mean(log_cv))]
def main(): # Get the data and targets df = pd.read_csv('train1.csv') df = df[df.rating != 'rating'] corpus = [review for review in df.review] splitPoint = len(corpus)*2/3 trainingCorpus = corpus[:splitPoint] testCorpus = corpus[splitPoint:] target = [rating for rating in df.rating] trainingTarget = np.array(target[:splitPoint]) testTarget = np.array(target[splitPoint:]) # Train the algorithm train_X, vocabList = createVectorizer(trainingCorpus, 'None', True) NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget) # Test the algorithm test_X = createVectorizer(testCorpus, vocabList, True) test_predict = NB_Bern_model.predict(test_X) print(np.mean(test_predict == testTarget)) print metrics.classification_report(testTarget, test_predict, target_names=['0', '1']) # Make Predictions predict_df = pd.read_csv('test2.csv') predictCorpus = [review for review in predict_df.review] member = [memberid for memberid in predict_df.ID] predict_X = createVectorizer(predictCorpus, vocabList, True) predictions = NB_Bern_model.predict(predict_X) predict_df.columns = ['ID', 'Predicted'] for i in range(len(member)): predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i] predict_df.to_csv('submission1.csv', sep = ',', index=False)
def synergy_naive_bayes(data, target):
    # generate champion relations as binaries
    for i in xrange(len(data)):
        temp = []
        for j in xrange(len(data[i])):
            if data[i][j] == -1:
                temp.append(1)
            else:
                temp.append(0)
        for j in xrange(len(data[i])):
            if data[i][j] == 1:
                temp.append(1)
            else:
                temp.append(0)
        num_champ = 124
        for j in xrange(num_champ):
            for k in xrange(j, num_champ):
                temp.append(temp[j] * temp[k])
                temp.append(temp[j + num_champ] * temp[k + num_champ])
        data[i] = temp

    X = array(data)
    y = array(target)
    # shuffle X and y together and unpack; shuffling only the zipped list
    # would leave the X and y arrays used below in their original order
    combined = zip(X, y)
    shuffle(combined)
    X = array([pair[0] for pair in combined])
    y = array([pair[1] for pair in combined])

    gnb = BernoulliNB()
    y_pred = gnb.fit(X[:len(X) * 4 / 5], y[:len(y) * 4 / 5]).predict(X[len(X) * 4 / 5:])
    print (metrics.classification_report(y[len(y) * 4 / 5:], y_pred))
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias): labels = [] # Will contain all the final labels that result from the voting clf_c1 = MultinomialNB() clf_c1.fit(train_x,train_y) clf_c2 = BernoulliNB() clf_c2.fit(train_x,train_y) clf_f1 = svm.SVC(kernel='linear',cache_size = 512) clf_f1.fit(train_f_x,train_f_y) clf_f2 = svm.SVC(kernel='rbf',cache_size = 512) clf_f2.fit(train_f_x,train_f_y) p1 = clf_c1.predict(test_x) p2 = clf_c2.predict(test_x) p3 = clf_f1.predict(test_f_x) p4 = clf_f2.predict(test_f_x) if bias == 'content': for i in range(len(p1)): if p1[i] == p2[i] or p1[i] == p3[i]: labels.append(p1[i]) else: labels.append(p2[i]) elif bias == "syntax": for i in range(len(p1)): if p1[i] == p3[i] or p1[i] == p4[i]: labels.append(p1[i]) else: labels.append(p3[i]) else: print 'Please enter a valid bias ("syntax" or "content")!' p_combined = np.array(labels) accuracy = (np.sum(p_combined == test_y)/np.float_(len(test_y))) return accuracy
class NaiveBayesClassifierBernoulli:
    """
    This class encapsulates the Bernoulli Naive Bayes functionality of
    scikit-learn's BernoulliNB class.
    """

    def __init__(self, matrixFileName=matrixFilePath, dicFileName=dictFilePath):
        self.X, self.Y = load_svmlight_file(matrixFileName)
        self.dictionary = pickle.load(open(dicFileName, "rb"))
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            return self.bernoulliNB.predict(row)
        else:
            return None

    def classifyOneSentenceWithProbability(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            # return the margin between the positive- and negative-class probabilities
            a = self.bernoulliNB.predict_proba(row)
            return a[0][1] - a[0][0]
        else:
            return None
def naive_bayes(data, target):
    # change data to binary
    for i in xrange(len(data)):
        temp = []
        for j in xrange(len(data[i])):
            if data[i][j] == -1:
                temp.append(1)
            else:
                temp.append(0)
        for j in xrange(len(data[i])):
            if data[i][j] == 1:
                temp.append(1)
            else:
                temp.append(0)
        data[i] = temp

    X = array(data)
    y = array(target)
    # shuffle X and y together and unpack; shuffling only the zipped list
    # would leave the X and y arrays used below in their original order
    combined = zip(X, y)
    shuffle(combined)
    X = array([pair[0] for pair in combined])
    y = array([pair[1] for pair in combined])

    gnb = BernoulliNB()
    y_pred = gnb.fit(X[:len(X) * 4 / 5], y[:len(y) * 4 / 5]).predict(X[len(X) * 4 / 5:])
    print (metrics.classification_report(y[len(y) * 4 / 5:], y_pred))
def bernoulli_classify(): clf = BernoulliNB() traindata = [] traintarget = [] for f in glob.glob("../../../res/articles/training_data/*-articles.json"): target = f.replace("-articles.json", "") target = re.sub(r".*\/+", "", target) output = readWholeFileBernoulli(f, target) traindata.extend(output[0]) traintarget.extend(output[1]) testdata = [] testtarget = [] for f in glob.glob("../../../res/articles/test_data/*-articles.json"): target = f.replace("-articles.json", "") target = re.sub(r".*\/+", "", target) output = readWholeFileBernoulli(f, target) testdata.extend(output[0]) testtarget.extend(output[1]) clf.fit(traindata, traintarget) ncorrect = 0 total = len(testdata) for i in range(len(testdata)): predict = clf.predict(testdata[i]) correct = testtarget[i] if correct == predict[0]: ncorrect += 1 print ("Correct: {0} - Predicted: {1}".format(correct, predict[0])) print "Correct ", ncorrect, " Total ", total, " Correctness ", ncorrect * 1.0 / total
def NB_train_classifier(train_x, train_y):
    """
    Fits a Bernoulli Naive Bayes classifier on the training set
    and returns the trained classifier.
    """
    classifier = BernoulliNB()
    classifier.fit(train_x, train_y)
    return classifier
def MungeData(train, test): todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47'] print(todrop) train.drop(todrop, axis=1, inplace=True) test.drop(todrop, axis=1, inplace=True) features = train.columns[2:] for col in features: if((train[col].dtype == 'object')): print(col) train, binfeatures = Binarize(col, train) test, _ = Binarize(col, test, binfeatures) nb = BernoulliNB() nb.fit(train[col+'_'+binfeatures].values, train.target.values) train[col] = \ nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1] test[col] = \ nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1] train.drop(col+'_'+binfeatures, inplace=True, axis=1) test.drop(col+'_'+binfeatures, inplace=True, axis=1) features = train.columns[2:] train[features] = train[features].astype(float) test[features] = test[features].astype(float) train.fillna(-1, inplace=True) test.fillna(-1, inplace=True) return train, test
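# The Binarize helper called in MungeData above is not defined in this
# snippet. Judging from how its output is used (dummy columns named
# col+'_'+binfeatures that are fed to BernoulliNB), it appears to one-hot
# encode a single categorical column. A hypothetical reconstruction under
# that assumption -- not necessarily the author's implementation:
import numpy as np

def Binarize(columnName, df, features=None):
    # encode the column as strings, then add one 0/1 indicator column per level
    df[columnName] = df[columnName].astype(str)
    if features is None:
        features = np.unique(df[columnName].values)
    for x in features:
        df[columnName + '_' + x] = df[columnName].map(lambda y: 1 if y == x else 0)
    return df, features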
def bnb_fit(train_data, train_lbl_data):
    from sklearn.naive_bayes import BernoulliNB
    print "Starts bnb"
    bnb = BernoulliNB()
    bnb.fit(train_data, train_lbl_data)
    return bnb
def naive_bayes(df,column): reviews_pn = df[df['class'].isin(['positive','negative'])] comments = list(reviews_pn[column].values) classes = list(reviews_pn['class'].values) # preprocess creates the term frequency matrix for the review data set stop = stopwords.words('english') count_vectorizer = CountVectorizer(stop_words = stop, ngram_range=(1,3)) comments1 = count_vectorizer.fit_transform(comments) tfidf_comments = TfidfTransformer(use_idf=True).fit_transform(comments1) # preparing data for split validation. 60% training, 40% test data_train,data_test,target_train,target_test = cross_validation.train_test_split(tfidf_comments,classes,test_size=0.4,random_state=43) classifier = BernoulliNB().fit(data_train,target_train) predicted = classifier.predict(data_test) print classification_report(target_test,predicted) print "The accuracy score is {:.2%}".format(accuracy_score(target_test,predicted)) most_informative_feature_for_binary_classification(count_vectorizer,classifier,n=20) #predict on unknown reviews_nc = reviews_df[reviews_df['class'] == ''] comments_nc = list(reviews_nc[column].values) comments_nc1 = count_vectorizer.transform(comments_nc) tfidf_comments_nc = TfidfTransformer(use_idf=True).fit_transform(comments_nc1) new_predicted = classifier.predict(tfidf_comments_nc) print "negative = %s" %sum(new_predicted == 'negative') print "positive = %s" %sum(new_predicted == 'positive')
def doclassify(self, type='normal'):
    if type == 'normal':
        # default hyperparameters: alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True
        clf = BernoulliNB()
        clf.fit(self.train_x, self.train_y)
        score = clf.score(self.train_x, self.train_y)
        print 'score = ', score
def BernoulliNB_1(train_predictors, test_predictors, train_target, test_target):
    clf = BernoulliNB()
    clf.fit(train_predictors, train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print "Accuracy for Bernoulli Naive Bayes: " + str(accuracy)
    return accuracy, predicted
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implementing classification model - using BernoulliNB
    clf_BNB = BernoulliNB(alpha=.01)
    clf_BNB.fit(data_train_vectors, data_train.target)
    y_pred = clf_BNB.predict(data_test_vectors)
    return y_pred
def test_BernouliNB4():
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
    ])
    print 'X ' + str(X)
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print 'Y ' + str(Y)

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1, 1],
    ])
    for i in xrange(len(X2)):
        #pred_ret = clf.predict_proba(X2[i])
        pred_ret = clf.predict(X2[i])
        print 'X2[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret)
def render_content(self): if self.text_source is None: return "No text source selected." from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import BernoulliNB from sklearn import metrics self.dm("creating vectorizer") vectorizer = CountVectorizer(stop_words=self.get_user_list(self.stop_list), max_features=self.vocab_size) data = self.get_column_data(self.text_source) self.dm("using vectorizer") X_train = vectorizer.fit_transform(data) Y_train = self.get_column_data(self.code_source) self.dm("creating classifier") clf = BernoulliNB() clf.fit(X_train, Y_train) accuracy = clf.score(X_train, Y_train) self.dm("predicting") pred = clf.predict(X_train) cm = metrics.confusion_matrix(Y_train, pred) self.dm("displaying result") html_output = "accuracy is " + str(round(accuracy, 2)) html_output += '<pre>'+ str(cm) + '</pre>' return html_output
def generatePredictingModel(data): """ Build the prediction model (based on the data set we have) in order to be able to predict the category of a new video from the user input Return a classifier able to predict the category of a video based on its title and description. """ try: # Intitialize a timer to compute the time to build the model start = time.time() # Split into train-test data set X = data[[x for x in data.columns if x in ('title', 'description')]] Y = data[[x for x in data.columns if x in ('video_category_id')]] X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10) # Build the 2 text corpus corpus_title = X_train['title'].values.tolist() corpus_description = X_train['description'].values.tolist() # initializes the 2 vectorizers. count_vectorizer_title = CountVectorizer() count_vectorizer_description = CountVectorizer() # learn the 2 vocabulary dictionary count_vectorizer_title.fit(corpus_title) count_vectorizer_description.fit(corpus_description) # Build the sparse matrices X_train_count_title = count_vectorizer_title.transform(X_train['title']) X_train_count_description = count_vectorizer_description.transform(X_train['description']) X_test_count_title = count_vectorizer_title.transform(X_test['title']) X_test_count_description = count_vectorizer_description.transform(X_test['description']) # Set and train the models (for title and description features) model_count_title = BernoulliNB() model_count_description = BernoulliNB() model_count_title.fit(X_train_count_title, Y_train['video_category_id']) model_count_description.fit(X_train_count_description, Y_train['video_category_id']) # Merge the title and description predictions and build a new prediction based on these 2 predictions combined new_df_train = pd.DataFrame() new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title) new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description) new_df_test = pd.DataFrame() new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title) new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description) tree = DecisionTreeClassifier() tree.fit(new_df_train, Y_train) end = time.time() execution_time = end - start print "Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time) time.sleep(3) return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description except: raise VideoAnalysisException(" Error while creation of predictive model ")
def learn_model(data, target):
    # preparing data for split validation. 80% training, 20% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=43
    )
    classifier = BernoulliNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)
def score(train_X, train_y):
    X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    clf = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)
    return log_loss(y_valid, y_pred)
def testBoGNB(self): ''' Test on sentiment analysis task using Naive Bayes classifier with Bag-of-Word feature vectors. ''' wordlist = [] # Preprocessing of original txt data set for i, sent in enumerate(self.senti_train_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] wordlist.extend(words) for i, sent in enumerate(self.senti_test_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] wordlist.extend(words) word_dict = set(wordlist) word2index = dict(zip(word_dict, range(len(word_dict)))) # Build BoG feature train_size = len(self.senti_train_txt) test_size = len(self.senti_test_txt) pprint('Training set size: %d' % train_size) pprint('Test set size: %d' % test_size) train_feat = np.zeros((train_size, len(word_dict)), dtype=np.float) test_feat = np.zeros((test_size, len(word_dict)), dtype=np.float) # Using binary feature start_time = time.time() for i, sent in enumerate(self.senti_train_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] indices = map(lambda x: word2index[x], words) train_feat[i, indices] = 1.0 for i, sent in enumerate(self.senti_test_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] indices = map(lambda x: word2index[x], words) test_feat[i, indices] = 1.0 end_time = time.time() pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time)) pprint('Classification using Bernoulli Naive Bayes classifier: ') clf = BernoulliNB() # clf = LogisticRegression() clf.fit(train_feat, self.senti_train_label) train_pred_label = clf.predict(train_feat) train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size) pprint('Training accuracy = %f' % train_acc) pred_label = clf.predict(test_feat) acc = np.sum(pred_label == self.senti_test_label) / float(test_size) pprint('Accuracy: %f' % acc) train_pos_count = np.sum(self.senti_train_label == 1) train_neg_count = np.sum(self.senti_train_label == 0) test_pos_count = np.sum(self.senti_test_label == 1) test_neg_count = np.sum(self.senti_test_label == 0) pprint('Positive count in training set: %d' % train_pos_count) pprint('Negative count in training set: %d' % train_neg_count) pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count)) pprint('Positive count in test set: %d' % test_pos_count) pprint('Negative count in test set: %d' % test_neg_count) pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
def learnBModel(ip, label, tst, tst_label):
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(ip.data)
    X_test = vectorizer.transform(tst.data)
    tfidf_train = TfidfTransformer(use_idf=False).fit_transform(X_train)
    tfidf_test = TfidfTransformer(use_idf=False).fit_transform(X_test)
    classifier = BernoulliNB().fit(tfidf_train, label)
    predicted_BModel = classifier.predict(tfidf_test)
    evaluate_model(tst_label, predicted_BModel)
def naive_bayesB_classifier(X_train, categories, X_test, test_categories):
    from sklearn.naive_bayes import BernoulliNB
    clf = BernoulliNB(alpha=0.10000000000000001).fit(X_train, categories)
    y_nb_predicted = clf.predict(X_test)
    print "\n Here is the classification report for Naive Bayes classifier:"
    print metrics.classification_report(test_categories, y_nb_predicted)
    print "Accuracy score:"
    print metrics.accuracy_score(test_categories, y_nb_predicted)
    to_latex(test_categories, y_nb_predicted)
def BernoulliNB_pred(X_train, X_test, y_train):
    clf_NB = BernoulliNB()
    clf_NB.fit(X_train, y_train)
    # Return class probabilities for both sets; the positive-class column is
    # selected below (sklearn utilities could also handle this encoding/decoding)
    predictions_train = clf_NB.predict_proba(X_train)
    predictions = clf_NB.predict_proba(X_test)
    return predictions[:, 1], predictions_train[:, 1]
def nb_classifier(self, secret):
    clf = BernoulliNB()
    x = self.raw_attr_vector(secret)
    y = self.get_labels(secret)
    fsl = self.feature_sel(secret)
    new_x = fsl.transform(x)
    clf.fit(new_x, y)
    new_y = clf.predict(new_x)
    return clf, fsl, self.evaluate(new_y, y)
# TF-IDF processing
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)
x_train = vectorizer.fit_transform(data_train.data)
x_test = vectorizer.transform(data_test.data)
print('Training set samples: %d, number of features: %d' % x_train.shape)
print('Stop words:\n', end=' ')
#pprint(vectorizer.get_stop_words())
feature_names = np.asarray(vectorizer.get_feature_names())

# Compare classifier results
clfs = (MultinomialNB(), BernoulliNB())
result = []
for clf in clfs:
    r = make_test(clf)
    result.append(r)
    print('\n')
result = np.array(result)
time_train, time_test, err, names = result.T
time_train = time_train.astype(np.float)
time_test = time_test.astype(np.float)
err = err.astype(np.float)

x = np.arange(len(time_train))
mpl.rcParams['font.sans-serif'] = ['simHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 7), facecolor='w')
##datasets with a validation set X_train2 = full_df[:1120000, :] X_valid = full_df[1120000:1600000, :] from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y_train1 = le.fit_transform(full_data['Sentiment']) y_train2 = le.transform(full_data['Sentiment'][:1120000]) y_valid = le.transform(full_data['Sentiment'][1120000:]) y_test = le.transform(test_data_pos_neg['Sentiment']) ######Try Binomial Naive Bayes Model without word stemming###### from sklearn.naive_bayes import BernoulliNB ##Convert word frequency matrix into binary matrix X_train1_bin = X_train1.copy() X_train1_bin[X_train1_bin > 0] = 1 clf_ber_bayes = BernoulliNB() clf_ber_bayes.fit(X_train1_bin, y_train1) train_preds = clf_ber_bayes.predict(X_train1_bin) accuracy_score(train_preds, y_train1) #Convert test dataframe to binary X_test_bin = X_test.copy() X_test_bin[X_test_bin > 0] = 1 test_preds = clf_ber_bayes.predict(X_test_bin) accuracy_score(y_test, test_preds) ##84.12 % accuracy_score
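# The manual copy-and-threshold step above can also be delegated to the
# estimator: BernoulliNB has a binarize parameter that thresholds features on
# the fly, so the raw count matrices can be passed in unchanged. A minimal
# sketch under that assumption, reusing the variables from the snippet above:
from sklearn.naive_bayes import BernoulliNB

# binarize=0.0 maps every count > 0 to 1 inside the estimator, which is
# equivalent to the explicit X[X > 0] = 1 conversion above
clf_ber_bayes = BernoulliNB(binarize=0.0)
clf_ber_bayes.fit(X_train1, y_train1)
test_preds = clf_ber_bayes.predict(X_test)
accuracy_score(y_test, test_preds)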
def modelTraining(X_train, X_test, y_train, y_test, f): models = {} # Linear SVC try: lsvc = LinearSVC() y_pred = lsvc.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Linear Support Vector Classifier"] = model_accr f.writelines( "\n Accuracy of Linear Support Vector Classifier is " + str(model_accr)) except: logging.info("LSVC is throwing exception") f.writelines("\n LSVC is throwing exception") # KNN try: knn = KNeighborsClassifier() y_pred = knn.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["KNN Classifier"] = model_accr f.writelines("\n Accuracy of KNN Classifier is " + str(model_accr)) except: logging.info("KNN is throwing exception") f.writelines("\n KNN is throwing exception") # DTC try: clf_gini = DecisionTreeClassifier(criterion="gini", random_state=0) y_pred = clf_gini.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Decision Tree Classifier - GINI"] = model_accr f.writelines( "\n Accuracy of Decision Tree Classifier - GINI is " + str(model_accr)) except: logging.info("DTC GINI is throwing exception") f.writelines("\n DTC GINI is throwing exception") try: clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=0) y_pred = clf_entropy.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Decision Tree Classifier - ENTROPY"] = model_accr f.writelines( "\n Accuracy of Decision Tree Classifier - ENTROPY is " + str(model_accr)) except: logging.info("DTC ENTROPY is throwing exception") f.writelines("\n DTC ENTROPY is throwing exception") # Multinomial NB try: mnb_model = MultinomialNB() y_pred = mnb_model.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Multinomial Naive Bayes"] = model_accr f.writelines("\n Accuracy of Multinomial NB is " + str(model_accr)) except: logging.info("Multinomial NB is throwing exception") f.writelines("\n Multinomial NB is throwing exception") # Bernoulli NB try: bnb_model = BernoulliNB() y_pred = bnb_model.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Bernoulli Naive Bayes"] = model_accr f.writelines("\n Accuracy of Bernoulli NB is " + str(model_accr)) except: logging.info("Bernoulli NB is throwing exception") f.writelines("\n Bernoulli NB is throwing exception") # Gaussian NB try: gnb_model = GaussianNB() y_pred = gnb_model.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Gaussian Naive Bayes"] = model_accr f.writelines("\n Accuracy of GaussianNB is " + str(model_accr)) except: logging.info("GaussianNB is throwing exception") f.writelines("\n GaussianNB is throwing exception") # ADB try: adb = AdaBoostClassifier(n_estimators=200, learning_rate=1) # Train Adaboost Classifer y_pred = adb.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["AdaBoost Classifier"] = model_accr f.writelines("\n Accuracy of AdaBoost Classifier is " + str(model_accr)) except: logging.info("AdaBoost Classifier is throwing exception") f.writelines("\n AdaBoost Classifier is throwing exception") # Random Forest Classifier try: rfc = RandomForestClassifier(n_estimators=100) y_pred = rfc.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Random Forest Classifier"] = model_accr 
f.writelines("\n Accuracy of Random Forest Classifier is " + str(model_accr)) except: logging.info("Random Forest Classifier is throwing exception") f.writelines( "\n Random Forest Classifier is throwing exception") return (models)
def classification_voting(X, y, nome):
    clf2 = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2, random_state=0)
    clf3 = BernoulliNB()
    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('bnb', clf3)], voting='soft')
    classification_model_cv(X, y, eclf2, "Voting Model " + nome)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) # call garbage collection to release some memory del train, test, user, tf_csr, tfidf_csr gc.collect() # ------------------------------------------------------------------------------------------------- print('-' * 100) print(f'Gender prediction with {TARGET_FEAT}\n') models = dict( lr=LogisticRegression(random_state=seed, C=5, solver='sag'), svm=LinearSVC(random_state=seed, C=0.5), pac=PassiveAggressiveClassifier(random_state=seed, C=0.05), ridge=RidgeClassifier(random_state=seed, alpha=5), sgd=SGDClassifier(random_state=seed, penalty='l1', loss='log', alpha=1e-6), bnb=BernoulliNB(alpha=0.1), mnb=MultinomialNB(alpha=0.1), ) # specify target label y_train = label_gender # define features train_feat_gender = pd.DataFrame() test_feat_gender = pd.DataFrame() for name, model in models.items(): timer.start() stack_train, stack_test = kfold_stack_binary(kfold, model, x_train, y_train, x_test) timer.stop()
############### save_classifier = open("originalnaivebayes5k.pickle", "wb") pickle.dump(classifier, save_classifier) save_classifier.close() MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100) save_classifier = open("MNB_classifier5k.pickle", "wb") pickle.dump(MNB_classifier, save_classifier) save_classifier.close() BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100) save_classifier = open("BernoulliNB_classifier5k.pickle", "wb") pickle.dump(BernoulliNB_classifier, save_classifier) save_classifier.close() LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100) save_classifier = open("LogisticRegression_classifier5k.pickle", "wb")
from sklearn.naive_bayes import BernoulliNB from sklearn.pipeline import make_pipeline, make_union from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator from xgboost import XGBClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:0.8702380952380953 exported_pipeline = make_pipeline( StackingEstimator(estimator=BernoulliNB(alpha=100.0, fit_prior=True)), StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=8, min_samples_split=20)), XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=4, n_estimators=100, nthread=1, subsample=0.7000000000000001)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
def classify(X, y, clf_type='nbc'): """ Preprocess the input documents to extract feature vector representations of them. Your features should be N-gram counts, for N<=2. 1. Experiment with the complexity of the N-gram features (i.e., unigrams, or unigrams and bigrams): `gram_min` + `gram_max` 2. Experiment with removing stop words. (see NLTK) 3. Remove infrequently occurring words and bigrams as features. You may tune the threshold at which to remove infrequent words and bigrams. 4. Search over hyperparameters for the three models (nb, svm, lr) to find the best performing model. All 4 of the above are done in the context of 10-fold cross validation on the data. On the training data, 3-fold cross validation is done to find the optimal hyperparameters (using randomized CV), which are then tested on held-out data. """ if clf_type == 'nbc': clf = BernoulliNB() params = SETTINGS_NB elif clf_type == 'svc': clf = LinearSVC() params = SETTINGS_SVC elif clf_type == 'lrc': clf = LogisticRegression() params = SETTINGS_LR else: raise Exception('invalid clf {}: {nbc, svc, lrc}'.format(clf_type)) # pipeline runs preprocessing and model during every CV loop pipe = Pipeline([ ('pre', CountVectorizer()), ('clf', clf), ]) model = RandomizedSearchCV( pipe, params, n_jobs=-1, n_iter=N_CV, cv=INNER, scoring='f1_macro' ) results = { 'test': {'loss': [], 'accuracy': [], 'confusion': [], 'errors': []}, 'train': {'loss': [], 'accuracy': [], 'confusion': []}, 'cv': {} } kf = StratifiedKFold(n_splits=FOLDS, shuffle=True) for i, (train_idx, test_idx) in enumerate(kf.split(X, y)): print("[{}] {}/{}".format(clf_type, i+1, FOLDS)) # split training and test sets X_train = X[train_idx] X_test = X[test_idx] y_train = y[train_idx] y_test = y[test_idx] # fit model model.fit(X_train, y_train) # save the best parameters from the inner-fold cross validation best_params = model.best_estimator_.get_params() for p in sorted(params.keys()): results['cv'][p] = best_params[p] # make predictions on train and test set y_test_pred = model.predict(X_test) y_train_pred = model.predict(X_train) # record some misclassified sentences idx_errors = np.where(y_test_pred != y_test)[0] np.random.shuffle(idx_errors) errors = X_test[idx_errors[:5]] results['test']['errors'].extend(errors) # store results results['test']['loss'].append(log_loss(y_test, y_test_pred)) results['test']['accuracy'].append(accuracy_score(y_test, y_test_pred)) results['test']['confusion'].append(confusion_matrix(y_test, y_test_pred)) results['train']['loss'].append(log_loss(y_train, y_train_pred)) results['train']['accuracy'].append(accuracy_score(y_train, y_train_pred)) results['train']['confusion'].append(confusion_matrix(y_train, y_train_pred)) return(results)
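# SETTINGS_NB, SETTINGS_SVC and SETTINGS_LR are defined elsewhere in the
# module; for the pipeline above their keys would use the step-name prefixes
# 'pre__' and 'clf__'. A hypothetical example of what the Naive Bayes search
# space could look like -- the values are assumptions, not the author's
# settings:
import numpy as np

SETTINGS_NB = {
    'pre__ngram_range': [(1, 1), (1, 2)],   # unigrams vs. unigrams + bigrams
    'pre__stop_words': [None, 'english'],   # with / without stop-word removal
    'pre__min_df': [1, 2, 5],               # drop infrequent words and bigrams
    'clf__alpha': np.logspace(-3, 1, 20),   # smoothing strength for BernoulliNB
}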
from scorer_semeval18 import main as eval tokenized_tweets = pickle.load(open(TOK_TWEETS_PATH, 'rb')) print('loaded tweets') data_matrix = construct_data_matrix(tokenized_tweets) print('constructed data matrix') print('Dim:', data_matrix.shape) print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix)) labels = np.asarray(open(CLEAN_LABELS_PATH).read().splitlines()) data_train, data_test, labels_train, labels_test = split_data( data_matrix, labels) print('split data') bern = BernoulliNB() bern.fit(data_train, labels_train) print("\nbern", bern.score(data_test, labels_test)) eval(labels_test, bern.predict(data_test)) multi = MultinomialNB() multi.fit(data_train + abs(np.min(data_train)), labels_train) print("\nmulti", multi.score(data_test + abs(np.min(data_test)), labels_test)) eval(labels_test, multi.predict(data_test)) tree = DecisionTreeClassifier(max_depth=10) tree.fit(data_train, labels_train) print("\ntree", tree.score(data_test, labels_test)) eval(labels_test, tree.predict(data_test)) clf = RandomForestClassifier(max_depth=3)
# [5] Results as percentages
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results = []
    for clf, name in [
        (BernoulliNB(alpha=0.4), 'Naive Bayes'),
        (LinearSVC(C=9), 'SVC'),
        (DecisionTreeClassifier(max_depth=26), 'DecisionTreeClassifier'),
        # (LogisticRegression(C=12), 'LogisticRegression'),
        # (RandomForestClassifier(max_depth=2, random_state=0), 'RandomForest'),
        (KNeighborsClassifier(n_neighbors=13), 'KNN')
    ]:
        # Y_train.reshape(Y_train.shape[0],)
        # Y_test.reshape(Y_test.shape[0])
        clf.fit(X_train, Y_train)
        predictions = clf.predict(X_train)
        training_accuracy = accuracy_score(predictions, Y_train)
votes.append(v) return str(mode(votes)[0]) def confidence(self, features): votes =[] for c in self._classifiers: v = c.predict(features) votes.append(v) choice_votes = int(mode(votes)[1]) conf = choice_votes / len(votes) return conf #def test_accuracy(self, x2,x3,x4,x5,x6, x7): # average = mean([x2,x3,x4,x5,x6, x7]) # return average BNB = BernoulliNB() BNB.fit(tfidf_train, y_train) pred = BNB.predict(tfidf_test) score = metrics.accuracy_score(y_test, pred) x2 = metrics.accuracy_score(y_test, pred) print("BernoulliNB Naive Bayes Accuracy: %0.3f" % score) #cm = metrics.confusion_matrix(y_test, pred, labels=[0,1]) #plot_confusion_matrix(cm, classes=[0, 1]) save_classifier = open("Pickled/BernoulliNB.pickle", "wb") pickle.dump(BNB, save_classifier) save_classifier.close() LR = LogisticRegression() LR.fit(tfidf_train, y_train) pred = LR.predict(tfidf_test)
def main(): show_plots = False #set to True to show plots, False to not show plots #read categories from arguments. e.g. "python3 test.py Comedy Drama Documentary Horror" categories = [] for arg in sys.argv[1:]: categories.append(arg) X, y, files_used = read_files(categories) try: high_info_words = high_information_words(X, y) X_high_info = [] for bag in X: new_bag = [] for words in bag: if words in high_info_words: new_bag.append(words) X_high_info.append(new_bag) except ZeroDivisionError: print("Not enough information too get high information words, please try again with more files.", file=sys.stderr) X_high_info = X X_wpm = wpm(files_used, categories, show_plots) X_dpm = dpm(files_used, categories, show_plots) X_wd = word_distribution(files_used, categories) doc2vec_model = Doc2Vec.load("d2v_150.model") #doc2vec_model = Doc2Vec.load("d2v_400.model") #Reason I don't infer the vector is that I used the data already while training the vector model (with tagged docoments), so I can just retrieve the data X_d2v = [doc2vec_model.docvecs[str(i)] for i in range(len(X))] #X_d2v = [doc2vec_model.infer_vector(to_list(str(i))) for i in X] X = [(str(x), str(x_high), wpm, dpm, wd, d2v) for x, x_high, wpm, dpm, wd, d2v in zip(X, X_high_info, X_wpm, X_dpm, X_wd, X_d2v)] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10) clfs = [ SVC(C=10, cache_size=500, class_weight=None, coef0=0.0, #parameters found using grid_search.py decision_function_shape=None, degree=3, gamma=0.0001, kernel='linear', max_iter=100000, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), MultinomialNB(alpha=1.0), BernoulliNB(), ] pipeline = Pipeline([ # Extract the features ('features', FeaturesExtractor()), # Use FeatureUnion to combine the features from subject and body ('union', FeatureUnion( transformer_list=[ #Pipeline bag-of-words model ('text', Pipeline([ ('selector', ItemSelector(key='text')), ('tfidf', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3))), #('chi-square', SelectKBest(chi2, 300)), ])), #Pipeline for high info words bag-of-words model ('text_high', Pipeline([ ('selector', ItemSelector(key='text_high')), ('tfidf', TfidfVectorizer(sublinear_tf=True, norm='l2')), ])), #Pipeline for wpm feature ('wpm', Pipeline([ ('selector', ItemSelector(key='wpm')), ('scaler', MinMaxScaler()), ])), #Pipeline for dpm feature ('dpm', Pipeline([ ('selector', ItemSelector(key='dpm')), ('scaler', MinMaxScaler()), ])), #Pipeline for wd feature ('wd', Pipeline([ ('selector', ItemSelector(key='wd')), ('scaler', MinMaxScaler()), ])), #Pipeline for d2v feature ('d2v', Pipeline([ ('selector', ItemSelector(key='d2v')), ('scaler', MinMaxScaler()), ])), #Pipeline for POS tag features # ('pos', Pipeline([ # ('selector', ItemSelector(key='pos')), # ('words', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3))) # ])), ], # weight components in FeatureUnion transformer_weights={ 'text': 0.2, 'text_high' : 1, 'wpm': 0, 'dpm': 0.2, 'wd': 0, 'd2v': 0, #'pos': 0, }, )), # Use a classifier on the combined features ('classifier', clfs[0]), ]) train(pipeline, X_train, y_train, categories, show_plots) final_pred = pipeline.predict(X_test) print("\nScores on test set:\n") print(metrics.accuracy_score(y_test, final_pred)) print(metrics.classification_report(y_test, final_pred, digits=3)) confusion_m = metrics.confusion_matrix(y_test, final_pred, labels=categories) plt.figure(figsize = (16, 9), dpi=150) sn.set(font_scale=1.4) #label 
size hm = sn.heatmap(confusion_m, annot=True, fmt='g', annot_kws={"size": 16}) #font size hm.set(xticklabels = categories, yticklabels = categories) plt.title(str(pipeline.named_steps['classifier']).split("(")[0] + ' Confusion Matrix') if show_plots: plt.show() hm.figure.savefig(str(pipeline.named_steps['classifier']).split("(")[0] + '_confusion_matrix_test' + '.png', figsize = (16, 9), dpi=150) plt.close()
# Naive Bayes Model
from sklearn.model_selection import train_test_split

quora_train, cv = train_test_split(quora_train, test_size=0.2)
x_train = quora_train.drop(['target'], axis=1)
y_train = quora_train['target']
x_cv = cv.drop(['target'], axis=1)
y_cv = cv['target']

from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
reviews_tfidf = tf_idf_vect.fit_transform(x_train['question_text'].values)
reviews_tfidf1 = tf_idf_vect.transform(x_cv['question_text'].values)
reviews_tfidf2 = tf_idf_vect.transform(quora_test['question_text'].values)

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

nb = BernoulliNB()
param_grid = {'alpha': [1000, 100, 10, 1, 0.1, 0.01, 0.001]}  # params we need to try on classifier
gsv = GridSearchCV(nb, param_grid, cv=2, verbose=1, n_jobs=-1, scoring='f1')
gsv.fit(reviews_tfidf, y_train)

nb = BernoulliNB(alpha=0.1)
nb.fit(reviews_tfidf, y_train)

train_pred = nb.predict(reviews_tfidf)
cv_pred = nb.predict(reviews_tfidf1)
test_pred = nb.predict(reviews_tfidf2)

print("Train Set Accuracy: {}".format(accuracy_score(train_pred, y_train)))
print("Train Set ROC: {}".format(roc_auc_score(train_pred, y_train)))
print("Train Set F1 Score: {}\n".format(f1_score(train_pred, y_train)))
print("Validation Set Accuracy: {}".format(accuracy_score(cv_pred, y_cv)))
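# After the grid search above, the selected smoothing value can be read back
# from the fitted search object instead of hard-coding alpha=0.1. A minimal
# sketch, reusing the gsv object from the snippet above:
print("Best params: {}  best CV F1: {}".format(gsv.best_params_, gsv.best_score_))

# refit with whatever alpha the search actually selected
nb_best = BernoulliNB(**gsv.best_params_)
nb_best.fit(reviews_tfidf, y_train)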
print Matr.shape Matr=Matr[1:] print len(Yval) a=1000 b=100000 prior1=(a+spamc-1)*1.0/(a+b+spamc+legitc-2) prior2=(a+legitc-1)*1.0/(a+b+spamc+legitc-2) # y=beta.pdf(x, a, b) from sklearn.metrics import precision_recall_curve from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import train_test_split a_train, a_test, b_train, b_test = train_test_split(Matr, Yval, test_size=0.2, random_state=42) clf = MultinomialNB(class_prior=[1,2]) clf2= BernoulliNB(class_prior=[prior1,prior2]) clf.fit(a_train, b_train) clf2.fit(a_train, b_train) Ax=clf.predict(a_test) Bx=clf2.predict(a_test) from sklearn.metrics import f1_score #print f1_score(b_test, Ax, average='macro') print f1_score(b_test, Bx, average='macro') import matplotlib.pyplot as plt precision, recall, _ = precision_recall_curve(b_test, Bx) plt.step(recall, precision, color='b', alpha=0.2,where='post') plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
def main(): x = [0, 1, 2, 3, 4, 5] LABELS = [ 'simple_nb', 'svm', 'KNN', 'gausian_nb', 'bernoulli', 'random_forest' ] plt.title("Accuracy of different algorithm on different user chat") plt.xlabel("Algorithms used") plt.ylabel("Accuracy") path = './chats_process' #test_negative = convert_float(test_nega) #labels_test_negative = get_labels(test_negative) count = 0 results = [0, 0, 0, 0, 0, 0] for filename in os.listdir(path): count += 1 #print filename t = path + '/' + filename + '/train.csv' splitRatio = .5 dataset = loadCsv(t) trainingSet, testSet = splitDataset(dataset, splitRatio) #testSet = testSet + test_nega trainset_copy = trainingSet test_copy = testSet trainingSet = convert_float(trainingSet) testSet = convert_float(testSet) #print testSet summaries = summarizeByClass(trainingSet) predictions = getPredictions(summaries, testSet) acc_NB = getAccuracy1(testSet, predictions) #print "accuracy_simpleNB= " + str(acc_NB) results[0] += acc_NB train_set = convert_float(trainset_copy) labels_train = get_labels(trainset_copy) test_set = convert_float(test_copy) #testSet = testSet + test_negative labels_test = get_labels(test_copy) #labels_test = labels_test + labels_test_negative #print labels_test tp_NB = TruePositive(predictions, testSet) tn_NB = TrueNegative(predictions, testSet) fp_NB = FalsePositive(predictions, testSet) fn_NB = FalseNegative(predictions, testSet) prec_NB = tp_NB / (tp_NB + fp_NB) rec_NB = tp_NB / (tp_NB + fn_NB) # SVM clf = svm.SVC(probability=True) clf.fit(train_set, labels_train) #clf.decision_function(test_set) results_SVM = clf.predict(test_set) a = clf.predict_proba(test_set) acc_svm = getAccuracy(results_SVM, labels_test) #print "accuracy_svm= " + str(acc_svm) results[1] += acc_svm tp_SVM = TruePositive(results_SVM, labels_test) tn_SVM = TrueNegative(results_SVM, labels_test) fp_SVM = FalsePositive(results_SVM, labels_test) fn_SVM = FalseNegative(results_SVM, labels_test) prec_SVM = tp_SVM / (tp_SVM + fp_SVM) rec_SVM = tp_SVM / (tp_SVM + fn_SVM) #KNN neigh = KNeighborsClassifier(n_neighbors=3) neigh.fit(train_set, labels_train) results_KNN = neigh.predict(test_set) b = neigh.predict_proba(test_set) acc_knn = getAccuracy(results_KNN, labels_test) #print "accuracy_knn= " + str(acc_knn) results[2] += acc_knn tp_knn = TruePositive(results_KNN, labels_test) tn_knn = TrueNegative(results_KNN, labels_test) fp_knn = FalsePositive(results_KNN, labels_test) fn_knn = FalseNegative(results_KNN, labels_test) prec_knn = tp_knn / (tp_knn + fp_knn) rec_knn = tp_knn / (tp_knn + fn_knn) #gausianNB clf = GaussianNB() clf.fit(train_set, labels_train) results_GausianNB = clf.predict(test_set) c = clf.predict_proba(test_set) acc_gausNB = getAccuracy(results_GausianNB, labels_test) #print "accuracy_gausNB= " + str(acc_gausNB) results[3] += acc_gausNB tp_gnb = TruePositive(results_GausianNB, labels_test) tn_gnb = TrueNegative(results_GausianNB, labels_test) fp_gnb = FalsePositive(results_GausianNB, labels_test) fn_gnb = FalseNegative(results_GausianNB, labels_test) prec_gnb = tp_gnb / (tp_gnb + fp_gnb) rec_gnb = tp_gnb / (tp_gnb + fn_gnb) #BernoiliNB clf = BernoulliNB() clf.fit(train_set, labels_train) results_BernoulliNB = clf.predict(test_set) d = clf.predict_proba(test_set) acc_BernoNB = getAccuracy(results_BernoulliNB, labels_test) #print "accuracy_bernoNB= " + str(acc_BernoNB) results[4] += acc_BernoNB tp_gnb = TruePositive(results_BernoulliNB, labels_test) tn_gnb = TrueNegative(results_BernoulliNB, labels_test) fp_gnb = FalsePositive(results_BernoulliNB, labels_test) fn_gnb = 
FalseNegative(results_BernoulliNB, labels_test) prec_bnb = tp_gnb / (tp_gnb + fp_gnb) rec_bnb = tp_gnb / (tp_gnb + fn_gnb) #randomforests clf = RandomForestClassifier(n_estimators=10) clf.fit(train_set, labels_train) results_randomforest = clf.predict(test_set) e = clf.predict_proba(test_set) acc_random_F = getAccuracy(results_randomforest, labels_test) #print "accuracy_random_forest= " + str(acc_random_F) results[5] += acc_random_F tp_gnb = TruePositive(results_randomforest, labels_test) tn_gnb = TrueNegative(results_randomforest, labels_test) fp_gnb = FalsePositive(results_randomforest, labels_test) fn_gnb = FalseNegative(results_randomforest, labels_test) prec_rf = tp_gnb / (tp_gnb + fp_gnb) rec_rf = tp_gnb / (tp_gnb + fn_gnb) #print "-------------\n" #print results_SVM #print results_KNN #print results_GausianNB #print results_BernoulliNB #print results_randomforest #print "\n" #print labels_test #print results #plt.plot(x,results,marker='o') ''' s = open('results.txt','a') with open('./chats_process/'+filename+'/'+'ml_training_'+'.csv', 'w') as csvoutput: writer = csv.writer(csvoutput) for a1,b1,c1,d1,e1,label in zip(a,b,c,d,e,labels_test): writer.writerow([a1[1],b1[1],c1[1],d1[1],e1[1],label]) s.write("%s\n" % a1) s.write("%s\n" % b1) s.write("%s\n" % c1) s.write("%s\n" % d1) s.write("%s\n" % e1) #s.write(b1) #s.write(str(c1)) #s.write(d1) #s.write(e1) s.write("................\n") print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet)) # prepare model summaries = summarizeByClass(trainingSet) # test model predictions = getPredictions(summaries, testSet) accuracy = getAccuracy(testSet, predictions) print('Accuracy: {0}%').format(accuracy) ''' t = open('remove_one5.txt', 'a') t.write(str(prec_NB) + " , " + str(rec_NB) + '\n') t.write(str(prec_SVM) + " , " + str(rec_SVM) + '\n') t.write(str(prec_gnb) + " , " + str(rec_gnb) + '\n') t.write(str(prec_bnb) + " , " + str(rec_bnb) + '\n') t.write(str(prec_rf) + " , " + str(rec_rf) + '\n') t.write(str(prec_knn) + " , " + str(rec_knn) + '\n')
with open('Kfold_acc.pickle', 'wb') as f: pickle.dump(alternative_Kfold_mean, f) #support vector machine from sklearn.svm import LinearSVC SVM = LinearSVC(random_state=123) SVM.fit(X_train,y_train) #decision tree from sklearn.tree import DecisionTreeClassifier DT = DecisionTreeClassifier(random_state=123) DT.fit(X_train,y_train) #naive bayes from sklearn.naive_bayes import BernoulliNB NB = BernoulliNB() NB.fit(X_train,y_train) y_pred_log = Log_Reg.predict(X_test) y_pred_svm = SVM.predict(X_test) y_pred_DT = DT.predict(X_test) y_pred_NB = NB.predict(X_test) ### validation score ## ## 10-fold cross validation ## from sklearn.model_selection import cross_val_score cross_val = (cross_val_score(Log_reg_fitted, X_train, y_train, cv=10)) alternative_Kfold_mean = np.mean(cross_val) print('Average validation score Log Reg: ',alternative_Kfold_mean,'\n', 'Validation score per fold: ','\n',cross_val) ### rest of the classifiers' K-fold validation scores ###
def train(self, with_trees, with_print): # if fetch_from_server: # self.fetch_tweets(with_print=with_print,pth=pth,remove_stopwords=remove_stopwords,ngrams=ngrams,n_min=n_min,n_max=n_max) # else: # self.train_test_split(with_print) # Logistic Regression print( '------------------------------------------------------------------------\n', 'Logistic Regression:') start_clf_time = time.time() LogisticRegression_classifier = LogisticRegression(fit_intercept=True) LogisticRegression_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(LogisticRegression_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\LogisticRegression.pickle", "wb") as classifier_f: pickle.dump(LogisticRegression_classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Naive Bayes:') start_clf_time = time.time() Naivebayes_classifier = GaussianNB() Naivebayes_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(Naivebayes_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) # Naivebayes_classifier.show_most_informative_features(15) with open( getcwd() + "\\classifiers\\words_as_features\\Naivebayes_classifier.pickle", "wb") as classifier_f: pickle.dump(Naivebayes_classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Multinomial Naive Bayes:') start_clf_time = time.time() MNB_classifier = MultinomialNB() MNB_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(MNB_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\MNB_classifier.pickle", "wb") as classifier_f: pickle.dump(MNB_classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Bernoulli Naive Bayes:') start_clf_time = time.time() BernoulliNB_classifier = BernoulliNB() BernoulliNB_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(BernoulliNB_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\BernoulliNB_classifier.pickle", "wb") as classifier_f: pickle.dump(BernoulliNB_classifier, classifier_f) classifier_f.close() ''' ================================================================================================================================================ ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ================================================================================================================================================ ''' print( 
'------------------------------------------------------------------------\n', 'C-Support Vector Machine:') print('======================\n', 'Linear Kernel') start_clf_time = time.time() SVC_lin_classifier = SVC(kernel='linear') SVC_lin_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_lin_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'linear' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_lin.pickle", "wb") as classifier_f: pickle.dump(SVC_lin_classifier, classifier_f) classifier_f.close() print('======================\n', 'Polynomial Kernel') start_clf_time = time.time() SVC_poly_classifier = SVC(kernel='poly', C=1, gamma=1) SVC_poly_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_poly_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'poly' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_poly.pickle", "wb") as classifier_f: pickle.dump(SVC_poly_classifier, classifier_f) classifier_f.close() # Also default kernel print('======================\n', 'Radial Basis Function Kernel') start_clf_time = time.time() SVC_classifier = SVC(kernel='rbf', gamma=0.1, C=1.38) SVC_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'rbf' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_rbf.pickle", "wb") as classifier_f: pickle.dump(SVC_classifier, classifier_f) classifier_f.close() print('======================\n', 'Sigmoid Kernel') start_clf_time = time.time() SVC_sig_classifier = SVC(kernel='sigmoid', gamma=10) SVC_sig_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_sig_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'sigmoid' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_sigmoid.pickle", "wb") as classifier_f: pickle.dump(SVC_sig_classifier, classifier_f) classifier_f.close() ''' ================================================================================================================================================ ''' print( '------------------------------------------------------------------------\n', 'Stochastic Gradient Descent:') start_clf_time = time.time() SGD_classifier = SGDClassifier() SGD_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(SGD_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SGD_classifier.pickle", "wb") as classifier_f: pickle.dump(SGD_classifier, 
classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Multi-layer Perceptron:') start_clf_time = time.time() MLP_Classifier = MLPClassifier(alpha=1) MLP_Classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(MLP_Classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\MLP_Classifier.pickle", "wb") as classifier_f: pickle.dump(MLP_Classifier, classifier_f) classifier_f.close() ''' Apart from training the forest classifier, both .dot and .png files are created with a visual representation of the trees ''' print( '------------------------------------------------------------------------\n', 'Random Forest:') start_clf_time = time.time() rnd_forest = RandomForestClassifier(n_jobs=-1, n_estimators=25, warm_start=True, max_features=7) RandomForest_Classifier = rnd_forest RandomForest_Classifier.fit(X=self.X_train, y=self.y_train) if with_trees: # Export trees i_tree = 0 for tree_in_forest in rnd_forest.estimators_: tree_dot_str = getcwd() + '/trees/tree_' + str(i_tree) + '.dot' with open(tree_dot_str, 'w') as tree_dot_file: tree_dot_file = tree.export_graphviz( tree_in_forest, out_file=tree_dot_file) (graph, ) = pydot.graph_from_dot_file(tree_dot_str) graph.write_png(tree_dot_str.replace('.dot', '.png')) i_tree = i_tree + 1 output = Kappa(RandomForest_Classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\RandomForest_Classifier.pickle", "wb") as classifier_f: pickle.dump(RandomForest_Classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Adaptive Boosting:') start_clf_time = time.time() AdaBoost_Classifier = AdaBoostClassifier() AdaBoost_Classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(AdaBoost_Classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\AdaBoost_Classifier.pickle", "wb") as classifier_f: pickle.dump(AdaBoost_Classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Voted Classifier:') start_clf_time = time.time() voted_classifier = VoteClassifier( Naivebayes_classifier, # SVR_classifier, MLP_Classifier, RandomForest_Classifier, # QDA_Classifier, AdaBoost_Classifier, SVC_lin_classifier, # SVC_poly_classifier, SVC_sig_classifier, SVC_classifier, SGD_classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier) with open( getcwd() + "\\classifiers\\words_as_features\\voted_classifier.pickle", "wb") as classifier_f: pickle.dump(voted_classifier, classifier_f) classifier_f.close() output = Kappa(voted_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = 
self.output_log.append(output) print( '------------------------------------------------------------------------' ) self.output_log['Train_News'] = self.sizes_df.loc['Training']['News'] self.output_log['Train_Spam'] = self.sizes_df.loc['Training'][ 'Not-News'] self.output_log['Test_News'] = self.sizes_df.loc['Testing']['News'] self.output_log['Test_Spam'] = self.sizes_df.loc['Testing']['Not-News'] self.output_log['feature_cnt'] = None self.output_log['type'] = 'descriptive_features' # Reorder output log self.output_log = self.output_log[[ # ID 'time_stamp', 'Name', 'Kernel', 'feature_cnt', 'type', # Sizes 'Train_News', 'Train_Spam', 'Test_News', 'Test_Spam', 'True_News', 'True_Spam', 'False_News', 'False_Spam', # Measures 'Accuracy', 'Kappa', 'rauc', 'duration', 'News_TPR', 'News_FPR', 'News_Prec', 'News_Recall', 'News_F1', 'Spam_TPR', 'Spam_FPR', 'Spam_Prec', 'Spam_Recall', 'Spam_F1', ]] # Saving results to file df = pd.DataFrame() if os.path.isfile( getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv"): retry = 5 while retry > 0: try: df = pd.DataFrame().from_csv( getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv", sep=";") except Exception as e: retry -= 1 time.sleep(60) print('Error reading file.', retry, 'attempts remaining ...') continue break df = self.output_log.append(df, ignore_index=True) else: df = self.output_log retry = 5 while retry > 0: try: df.to_csv( getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv", sep=";") print( 'saved to', getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv" ) except Exception as e: retry -= 1 time.sleep(60) print('Error writing to file.', retry, 'attempts remaining ...') continue break
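The VoteClassifier and Kappa helpers used in the method above are not defined in this file; as a rough sketch under that assumption (not the author's implementation), a hard-voting wrapper with the same call pattern could look like this:

from collections import Counter

class VoteClassifierSketch:
    # Hypothetical stand-in for the VoteClassifier used above: it takes
    # already-fitted classifiers and predicts by simple majority vote.
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def predict(self, X):
        all_preds = [clf.predict(X) for clf in self._classifiers]
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*all_preds)]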
# Each vector has the length of the entire vocabulary and # an integer count for the number of times each word appeared in the document. myPattern = r'[a-z]{4,}' if token_pattern else r'(?u)\b\w\w+\b' vectorizer = CountVectorizer(stop_words=stop_words, max_df=max_df, min_df=min_df, token_pattern=myPattern) counts = vectorizer.fit_transform(X_train) # Create classifier and fit for multinomial model. clfMulti = MultinomialNB() clfMulti.fit(counts, Y_train) # Create classifier and fit for bernoulli model clfBernoulli = BernoulliNB(binarize=1) clfBernoulli.fit(counts, Y_train) X_test = df_test.text Y_test = df_test.label # Transforms each document into a vector (with length of vocabulary of train documents) with an # integer count for the number of times each word appeared in the document example_count = vectorizer.transform(X_test) # Predict labels on the test data set predictionsMulti = clfMulti.predict(example_count) predictionsBernoulli = clfBernoulli.predict(example_count) def getPercentageCorrect(predictions):
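The body of getPercentageCorrect is cut off above; assuming it simply compares predictions with the test labels from the surrounding snippet, a sketch would be:

def getPercentageCorrect(predictions):
    # Assumed completion: share of predictions matching Y_test, as a percentage.
    correct = sum(1 for pred, true in zip(predictions, Y_test) if pred == true)
    return 100.0 * correct / len(Y_test)

# e.g. getPercentageCorrect(predictionsMulti) vs. getPercentageCorrect(predictionsBernoulli)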
svm_cv.fit(train_X, train_y) print(svm_cv.best_params_) print(svm_cv.cv_results_)""" gamma_best = 1.0#svm_cv.best_params_["gamma"] # final experiments (e.g., to get standard error) numruns = 8 # try a neural network since svm can take too long to converge nn = MLPClassifier(hidden_layer_sizes = (16, 8), alpha = 0.0, max_iter = 10, random_state = None) final_algs = { "Logistic Regression": LogisticRegression(penalty = "l1", solver = "saga", random_state = None, class_weight = "balanced", max_iter = 90, C = C_best), "SVM": SVC(kernel = "rbf", random_state = None, class_weight = "balanced", gamma = gamma_best, max_iter = 1000), "Naive Bayes": BernoulliNB(alpha = 1.0, fit_prior = True) #"Neural Network": nn } print("Starting final experiments") conf_mats = {} # holds the confusion matrices for each algorithm f1 = {} # holds the list of macro f1 scores for each algorithm for name in final_algs.keys(): conf_mats[name] = pd.DataFrame([[0, 0], [0, 0]]) f1[name] = [] # compute macro average of f1 score (i.e., f1 score for every run) so that we may calculate a confidence interval for i in range(numruns):
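The experiment loop itself is truncated above; a sketch of what each run presumably does (re-split, fit each algorithm, accumulate the 2x2 confusion matrix and the macro F1 so a standard error can be computed; the names X and y for the full data are assumptions):

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

for i in range(numruns):
    # X and y are assumed to hold the full feature matrix and labels.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
    for name, model in final_algs.items():
        preds = model.fit(X_tr, y_tr).predict(X_te)
        conf_mats[name] += confusion_matrix(y_te, preds)
        f1[name].append(f1_score(y_te, preds, average="macro"))

# Mean and standard error of the macro F1 per algorithm:
for name in final_algs:
    print(name, np.mean(f1[name]), np.std(f1[name], ddof=1) / np.sqrt(numruns))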
#####TRYING MIX OF ALL MODELS from sklearn.naive_bayes import MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier from sklearn.model_selection import ShuffleSplit from imblearn.over_sampling import SMOTE X = train_data.tweet y = train_data.label cv = ShuffleSplit(n_splits=20, test_size=0.2) models = [ MultinomialNB(), BernoulliNB(), LogisticRegression(), SGDClassifier(), LinearSVC(), RandomForestClassifier(), MLPClassifier() ] sm = SMOTE() # Init a dictionary for storing results of each run for each model results = { model.__class__.__name__: { 'accuracy': [], 'f1_score': [], 'confusion_matrix': []
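The results dictionary above is cut off; a sketch of an evaluation loop consistent with it (the per-fold TfidfVectorizer is an assumption, since the text vectorization step is not shown here):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

for train_idx, test_idx in cv.split(X):
    # Vectorize inside the fold, oversample the training portion with SMOTE,
    # then fit and score every model in the list.
    vec = TfidfVectorizer()
    X_tr = vec.fit_transform(X.iloc[train_idx])
    X_te = vec.transform(X.iloc[test_idx])
    y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
    X_tr, y_tr = sm.fit_resample(X_tr, y_tr)
    for model in models:
        preds = model.fit(X_tr, y_tr).predict(X_te)
        res = results[model.__class__.__name__]
        res['accuracy'].append(accuracy_score(y_te, preds))
        res['f1_score'].append(f1_score(y_te, preds, average='weighted'))
        res['confusion_matrix'].append(confusion_matrix(y_te, preds))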
train_label_fn = 'train-labels-100.txt' test_data_fn = 'test-features.txt' test_label_fn = 'test-labels.txt' (train_data, train_label) = read_data(train_data_fn, train_label_fn) (test_data, test_label) = read_data(test_data_fn, test_label_fn) clf = MultinomialNB() clf.fit(train_data, train_label) y_pred = clf.predict(test_data) print('Training size = %d, accuracy = %.2f%%' % \ (train_data.shape[0],accuracy_score(test_label, y_pred)*100)) train_data_fn = 'train-features-50.txt' train_label_fn = 'train-labels-50.txt' test_data_fn = 'test-features.txt' test_label_fn = 'test-labels.txt' (train_data, train_label) = read_data(train_data_fn, train_label_fn) (test_data, test_label) = read_data(test_data_fn, test_label_fn) clf = MultinomialNB() clf.fit(train_data, train_label) y_pred = clf.predict(test_data) print('Training size = %d, accuracy = %.2f%%' % \ (train_data.shape[0],accuracy_score(test_label, y_pred)*100)) clf = BernoulliNB(binarize = .5) clf.fit(train_data, train_label) y_pred = clf.predict(test_data) print('Training size = %d, accuracy = %.2f%%' % \ (train_data.shape[0],accuracy_score(test_label, y_pred)*100))
def classification_naive_bayes(X, Y, nome): nb_model = BernoulliNB() classification_model_cv(X, Y, nb_model, "Naive Bayes "+nome)
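classification_model_cv is not defined in this file; a plausible minimal sketch, assuming it only reports cross-validated accuracy for the model it is given:

from sklearn.model_selection import cross_val_score

def classification_model_cv(X, Y, model, label):
    # Assumed behaviour: 5-fold cross-validated accuracy, printed with the label.
    scores = cross_val_score(model, X, Y, cv=5, scoring="accuracy")
    print("%s: mean accuracy %.3f (+/- %.3f)" % (label, scores.mean(), scores.std()))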
vecCount = CountVectorizer(min_df=3) vecCount.fit(X_train["text"]) # Number of distinct words print("word size: ", len(vecCount.vocabulary_)) # Show the first 5 words print("word content: ", dict(list(vecCount.vocabulary_.items())[0:5])) # Vectorize the training and test data X_train_vec = vecCount.transform(X_train["text"]) X_test_vec = vecCount.transform(X_test["text"]) # Show the first 5 vectorized rows print("First 5 vectorized rows") print(pd.DataFrame(X_train_vec.toarray()[0:5], columns=vecCount.get_feature_names())) # -Build the model- # Bernoulli model model = BernoulliNB() model.fit(X_train_vec, Y_train["class"]) # -Evaluation- print("Train accuracy = %.3f" % model.score(X_train_vec, Y_train)) print("Test accuracy = %.3f" % model.score(X_test_vec, Y_test)) # -Prediction- # Create text data for prediction data = np.array([ "I am happy.", "Are you happy? 00", "Free service! Please contact me immediately. But it is 300 US dollars next month." ]) df_data = pd.DataFrame(data, columns=["text"]) # Vectorize the prediction text data
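The snippet above stops right before the prediction step; the likely continuation (a sketch reusing vecCount and model from above) vectorizes the new texts and classifies them:

# Sketch of the continuation: vectorize the prediction texts and classify them.
data_vec = vecCount.transform(df_data["text"])
print(pd.DataFrame({"text": df_data["text"], "predicted": model.predict(data_vec)}))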
print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) score = f05_scorer(clf, X_test, y_test) if score > best_score: best_clf = clf best_score = score fout = open('kbest-multinomialNB.pickle','w') pickle.dump(clf,fout) fout.close() ####################### print "Bernoulli NB" clf = BernoulliNB(binarize = 0.0, alpha = 0.25, fit_prior = False) kf = KFold(72000, n_folds=10, shuffle=True) best_score = 0 best_clf = 0 for train_index, test_index in kf: print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) score = f05_scorer(clf, X_test, y_test) if score > best_score: best_clf = clf best_score = score fout = open('kbest-bernoulliNB.pickle','w')
def __init__(self, info, verbose=True, debug_mode=False, run_on_gpu=False): self.label_num = info['label_num'] self.target_num = info['target_num'] self.task = info['task'] self.metric = info['metric'] self.postprocessor = MultiLabelEnsemble( LogisticRegression(), balance=False) # To calibrate proba if debug_mode >= 2: self.name = "RandomPredictor" self.model = RandomPredictor(self.target_num) self.predict_method = self.model.predict_proba return if info['task'] == 'regression': if info['is_sparse'] == True: self.name = "BaggingRidgeRegressor" self.model = BaggingRegressor( base_estimator=Ridge(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... # Lukasz uses BernoulliNB() instead of Ridge() else: #self.name = "GradientBoostingRegressor" #self.model = GradientBoostingRegressor(n_estimators=1, verbose=verbose, warm_start = True, random_state=1) # There is a problem with "GradientBoostingRegressor", which does not accept non c-contiguous arrays. self.name = "RandomForestRegressor" self.model = RandomForestRegressor(n_estimators=1, random_state=1, warm_start=True) self.predict_method = self.model.predict else: if info['has_categorical']: # Out of laziness, we do not convert categorical variables... self.name = "RandomForestClassifier" self.model = RandomForestClassifier( n_estimators=1, verbose=verbose, random_state=1 ) # New: warm_start = True; warm start exists in sklearn 0.16.1 but is not used here for backward compatibility elif info['format'] == 'sparse_binary': self.name = "BaggingBernoulliNBClassifier" self.model = BaggingClassifier( base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... elif info['format'] == 'sparse': self.name = "BaggingMultinomialNBClassifier" self.model = BaggingClassifier( base_estimator=MultinomialNB(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... else: if info['label_num'] > 100: self.name = "BaggingGaussianNBClassifier" self.model = BaggingClassifier( base_estimator=GaussianNB(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... else: #self.name = "RandomForestClassifier" #self.model = RandomForestClassifier(n_estimators=1, verbose=verbose, warm_start = True , random_state=1) # New: now there is warm start in sklearn 0.16.1 self.name = "GradientBoostingClassifier" self.model = GradientBoostingClassifier( n_estimators=1, verbose=verbose, random_state=1, min_samples_split=10, warm_start=False) # New bug: warm start no longer works if info['task'] == 'multilabel.classification': self.model = MultiLabelEnsemble(self.model) self.predict_method = self.model.predict_proba
def training_step(data, vectorizer): training_text = data['Lyrics'] training_result = data['Year'] training_text = vectorizer.fit_transform(training_text) return BernoulliNB().fit(training_text, training_result)
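A possible way to call training_step, assuming a DataFrame with 'Lyrics' and 'Year' columns and a shared CountVectorizer (the sample rows below are purely illustrative):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB  # used inside training_step above

# Illustrative data only; the column names follow training_step's assumptions.
songs = pd.DataFrame({
    'Lyrics': ["la la la love", "guitars and trucks", "neon lights tonight"],
    'Year': [1995, 2004, 2012],
})
vectorizer = CountVectorizer()
clf = training_step(songs, vectorizer)
print(clf.predict(vectorizer.transform(["love tonight"])))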
SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression()), ]) elif algo == "Perceptron" or algo == "perceptron": SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', Perceptron()), ]) elif algo == "BernoulliNB" or algo == "bernoulliNB": SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', BernoulliNB()), ]) elif algo == "SGDClassifier" or algo == "sgdClassifier": SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier()), ]) # Fit model to training set SGD_clf.fit(X_train, y_train) # Predict on test set SVM_pred = SGD_clf.predict(X_test)
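Once SVM_pred is available, the fitted pipeline can be scored; a small follow-up sketch using the snippet's own variable names:

from sklearn.metrics import accuracy_score, classification_report

# Sketch: overall accuracy plus per-class precision/recall/F1 for the pipeline.
print("Accuracy:", accuracy_score(y_test, SVM_pred))
print(classification_report(y_test, SVM_pred))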
import pandas as pd import numpy as np from sklearn.model_selection import cross_val_score, cross_val_predict from sklearn import datasets from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import BernoulliNB from sklearn.metrics import accuracy_score import random from sklearn.ensemble import VotingClassifier random.seed(2002) iris = datasets.load_iris() X = iris.data Y = iris.target tree = DecisionTreeClassifier() GNB = GaussianNB() BNB = BernoulliNB() vote = VotingClassifier(estimators=[('tree', tree), ('Gnb', GNB), ('Bnb', BNB)], weights=[2, 1, 1]) vote.fit(X, Y) pred = vote.predict(X) print(accuracy_score(Y, pred))
tweet_data = train_data['tweet_text'] topic_data = topic_analysis(train_data) count = CountVectorizer(token_pattern=r'[a-zA-Z0-9#@%_$]+[a-zA-Z0-9#@%_$]+', lowercase=False) bag_of_words = count.fit_transform(tweet_data) bag_of_words_2 = count.transform(test_data['tweet_text']) X = bag_of_words.toarray() Y = np.array(topic_data) x_train = X x_test = bag_of_words_2.toarray() y_train = Y from sklearn.naive_bayes import BernoulliNB #from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report clf = BernoulliNB() model = clf.fit(x_train, y_train) predictions = model.predict(x_test) instance = test_data['instance_number'] dic = OrderedDict() for i in range(len(instance)): dic[instance[i]] = predictions[i] for k, v in dic.items(): print(str(k) + ' ' + str(v))
classifiers.append(GradientBoostingClassifier(random_state=random_state)) classifiers.append(RandomForestClassifier(random_state=random_state)) #Gaussian process classifiers.append(GaussianProcessClassifier(random_state=random_state)) #Generalized linear models classifiers.append(LogisticRegressionCV(random_state=random_state)) classifiers.append(PassiveAggressiveClassifier(random_state=random_state)) classifiers.append(RidgeClassifierCV()) classifiers.append(SGDClassifier(random_state=random_state)) classifiers.append(Perceptron(random_state=random_state)) classifiers.append(MLPClassifier(random_state=random_state)) #Naive Bayes classifiers.append(BernoulliNB()) classifiers.append(GaussianNB()) #Nearest Neighbors classifiers.append(KNeighborsClassifier()) #Discriminant analysis classifiers.append(LinearDiscriminantAnalysis()) #Support vector machine classifiers.append(SVC(random_state=random_state, probability=True)) classifiers.append(NuSVC(random_state=random_state, probability=True)) classifiers.append(LinearSVC(random_state=random_state)) #Trees classifiers.append(DecisionTreeClassifier(random_state=random_state))