def naive_bayes(x_value, y_value):
    X = x_value
    y = y_value
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_test_dtm = vect.transform(X_test)
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ')
    print(metrics.accuracy_score(y_test, y_pred_class))
    print('Null Accuracy: ')
    print(y_test.value_counts().head(1) / len(y_test))
    print('Confusion Matrix: ')
    print(metrics.confusion_matrix(y_test, y_pred_class))
def main(): """loads data, trains model, tests model Inputs: file: binary file containing sparse numpy array with text features file: binary file containing pandas dataframe with training labels Outs: print: classification report of classifier performance """ # Load training labels and text features chdir("../pickles") with open("word_counts.pkl", "rb") as f: X = pickle.load(f) with open("training_labels.pkl", "rb") as f: y = pickle.load(f) y = np.ravel(y["sponsored"]) # Create train and test splits X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # Create and train model clf = MultinomialNB() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print(classification_report(y_test, y_pred))
def run_analyzer(data_file):
    start_time = time.time()
    with open(data_file, 'rb') as f:
        data = pickle.load(f)
    labels = data['labels']
    features = data['features']

    # split into training and test data
    training_features, test_features, training_labels, test_labels = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=0)

    # the SVC fit below was immediately overwritten by MultinomialNB, so it is left commented out
    # clf = svm.SVC()
    # clf.fit(training_features, training_labels)
    clf = MultinomialNB().fit(training_features, training_labels)

    print("number of training samples: %d" % len(training_labels))
    print("number of test samples: %d" % len(test_labels))
    print("number of features: %d" % training_features.shape[1])
    print("score on the training data: %.2f" % clf.score(training_features, training_labels))

    predictions = np.array(list(map(float, clf.predict(test_features))))
    test_labels = np.array(list(map(float, test_labels)))
    success_rate = np.mean(predictions == test_labels)
    print("results fitting on test data:")
    print("success rate: %s" % success_rate)
    print("Runtime : %.2f seconds" % (time.time() - start_time))

## SCRIPT
#run_analyzer(DATA_FILE_2)
#cross_val(DATA_FILE)
#cross_val(DATA_FILE_2)
#search_parameters(DATA_FILE_2)
def train(self):
    '''
    ## -- How to predict -- ##
    query = "blah blah"
    q = list2Vec(hashit(query))
    clf2 = joblib.load('nb')
    print(clf2.predict(q))  # <--- returns type id
    '''
    limit = self.comment_limit
    sqls = ["SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=1 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=2 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=3 ORDER BY time DESC LIMIT " + str(limit)]
    print("training model")
    comments = self.sql2list(sqls)
    x, y = self.featureMatrix(comments)
    X = list2Vec(x)
    Y = list2Vec(y)

    q = "Let's talk about food."
    q_vec = list2Vec(hashit(q))

    # Predicting
    print("Classifying")
    clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    clf.fit(X, Y)
    joblib.dump(clf, self.path, compress=9)
def crossValidate(X_dataset, y):
    # cross validate model
    num_folds = 5
    kfold = cross_validation.StratifiedKFold(y, n_folds=num_folds, shuffle=True)
    # kfold = KFold(X.shape[0], n_folds=10, shuffle=True)
    avg_accuracy = 0
    avg_precision = 0
    avg_recall = 0
    print("----------- cross_validation k=5")
    for train, test in kfold:
        Xtrain, Xtest, ytrain, ytest = X_dataset[train], X_dataset[test], y[train], y[test]
        # clf = LinearSVC()
        clf = MultinomialNB(alpha=0.1)
        # clf = LDA()
        clf.fit(Xtrain.toarray(), ytrain)
        ypred = clf.predict(Xtest.toarray())

        accuracy = metrics.accuracy_score(ytest, ypred)
        # print("accuracy = ", accuracy)
        avg_accuracy += accuracy

        precision = metrics.precision_score(ytest, ypred)
        # print("precision: %0.3f" % precision)
        avg_precision += precision

        recall = metrics.recall_score(ytest, ypred)
        # print("recall: %0.3f" % recall)
        avg_recall += recall

    print("Average accuracy : ", avg_accuracy / num_folds)
    print("Average precision : ", avg_precision / num_folds)
    print("Average recall : ", avg_recall / num_folds)
def bag_of_words_probabilities(train_reviews, test_reviews):
    """
    Implements a baseline bag-of-words classifier.

    Returns a dictionary mapping tuples (review_id, class) to the probability
    that that review belongs to that class.
    """
    train_corpus = []
    test_corpus = []
    Y_train = []
    for review_id in train_reviews:
        review = train_reviews[review_id]
        train_corpus.append(review["text"])
        Y_train.append(review["rating"])

    vectorizer = CountVectorizer(stop_words='english')
    X_train = vectorizer.fit_transform(train_corpus)

    for review_id in test_reviews:
        review = test_reviews[review_id]
        test_corpus.append(review["text"])

    # clf = LinearSVC(class_weight='auto').fit(X_train, Y_train)
    # clf = LogisticRegression().fit(X_train, Y_train)
    clf = MultinomialNB().fit(X_train, Y_train)
    X_test = vectorizer.transform(test_corpus)
    Y_probability = clf.predict_proba(X_test)

    probability_dict = {}
    review_id_list = list(test_reviews.keys())
    for i in range(len(review_id_list)):
        probability_dict[review_id_list[i]] = Y_probability[i][1]
    return probability_dict
def MultinomialNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    # default alpha=1.0 is Laplace smoothing; setting alpha < 1 is called Lidstone smoothing
    nbClf = MultinomialNB(alpha=0.1)
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict_proba(testData)[:, 1]
    saveResult(enrollment_id, testLabel, 'Proba_sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
def main():
    # extract reviews from tsv files
    labeled_training_data = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)  # 25,000 reviews
    test_data = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)  # 25,000 reviews

    print("Creating BOW....")
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    # review_list / test_review_list are assumed to be built from the raw reviews elsewhere
    trained_data_features = vectorizer.fit_transform(review_list)
    trained_data_features = trained_data_features.toarray()  # convert to numpy array for faster processing

    print("Supervised Learning - Naive Bayes")
    nb_model = MultinomialNB(alpha=0.01)
    # using BOW as features and the given labels as response variables
    nb_model = nb_model.fit(trained_data_features, labeled_training_data["sentiment"])
    print("---------------------------------")
    print(" ")
    print("Predicting on test data: ")

    # BOW for test set
    test_data_features = vectorizer.transform(test_review_list)
    test_data_features = test_data_features.toarray()

    # use the trained model to make predictions
    predictions = nb_model.predict(test_data_features)

    # prepare output submission file
    prediction_output = pd.DataFrame(data={"id": test_data["id"], "sentiment": predictions})  # create pandas dataframe
    prediction_output.to_csv("BOW_NB.csv", index=False, quoting=3)  # write to csv file

    joblib.dump(vectorizer, 'bow_model.pkl')
    joblib.dump(nb_model, 'nb_bow_model.pkl')
def classify_reviews():
    import featurizer
    import gen_training_data
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier

    data = gen_training_data.gen_data()
    stemmed_data = featurizer.stem(data)
    tfidf = featurizer.tfidf(data)

    clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels'])
    predicted = clf.predict(tfidf['test_tfidf'])

    num_wrong = 0
    for expected, guessed in zip(data['testing_labels'], predicted):
        if expected - guessed != 0:
            num_wrong += 1
    print("num_wrong: %d" % num_wrong)

    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
    sgd_clf.fit(tfidf['train_tfidf'], data['training_labels'])
    sgd_pred = sgd_clf.predict(tfidf['test_tfidf'])
    print(np.mean(sgd_pred == data['testing_labels']))

    stem_tfidf = featurizer.tfidf(stemmed_data)
    sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels'])
    sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf'])
    print(np.mean(sgd_stem_prd == data['testing_labels']))
def main(clf):
    # print('getting train')
    train = pd.read_csv('dat/trainMN.tsv', sep='\t')
    # print('getting test')
    test = pd.read_csv('dat/devMN.tsv', sep='\t')

    global all_words
    all_words = word_to_set(train['Phrase'], trim=20, is_raw=True)

    # print('creating x dict vectors from train')
    train_x = train['Phrase']
    # print('extracting...')
    train_x = use_feature_dicts(train_x)
    # print(train_x)
    # print('creating train y')
    train_y = [int(y) for y in train['Sentiment']]

    if clf == 'NB':
        classifier = MultinomialNB().fit(train_x, train_y)
    elif clf == 'RF':
        classifier = RandomForestClassifier().fit(train_x, train_y)
    elif clf == 'LG':
        classifier = linear_model.LinearRegression()
        classifier = classifier.fit(train_x, train_y)
    elif clf == 'SGD':
        classifier = SGDClassifier().fit(train_x, train_y)

    # print('testing')
    test_x = use_feature_dicts(test['Phrase'])
    for i in classifier.predict(test_x):
        print(i)

    title = clf + '.pickle'
    pickle.dump(classifier, open(title, 'wb'))
def naive_bayes():
    nb = MultinomialNB()
    nb.fit(X_train, train_data.danger)
    nb_pred = nb.predict(X_test)
    nb_score = nb.score(X_test, y_test)
    precision, recall, _, _ = precision_recall_fscore_support(y_test, nb_pred)
    return precision, recall, str(nb_score)
class TrainNaiveBayes:

    def __init__(self, all_features, neu_labels):
        """ Trains a classifier using Naive Bayes """
        self._num_features = len(list(all_features.values())[0])
        self._X = numpy.zeros((1, self._num_features))  # Feature matrix
        self._Y = numpy.array([0])  # Label vector
        for user_id in neu_labels.keys():
            self._X = numpy.append(self._X, [all_features[user_id]], axis=0)
            self._Y = numpy.append(self._Y, [neu_labels[user_id]])
        self._X = numpy.delete(self._X, 0, 0)  # Delete the first row (contains all 0s)
        self._Y = numpy.delete(self._Y, 0)
        print("Using MultinomialNB")
        self._model = MultinomialNB()
        print(cross_validation.cross_val_score(self._model, self._X, self._Y, cv=10, scoring='f1'))
        self._model.fit(self._X, self._Y)

    def predict(self, features):
        A = numpy.zeros((1, self._num_features))
        for user_id in features.keys():
            A = numpy.append(A, [features[user_id]], axis=0)
        A = numpy.delete(A, 0, 0)
        return self._model.predict(A)
def train(good_sources, bad_sources, method, naive_bayes=None, keywords=list()):
    # train the algorithm
    good_samples = find_keywords(' '.join([entry[method] for entry in good_sources]))
    bad_samples = find_keywords(' '.join([entry[method] for entry in bad_sources]))

    # if we have an existing knowledge base to append this new information to, do so
    if naive_bayes:
        new_kws = set(good_samples + bad_samples)
        print('Using old keywords as well')
        print("# old keywords = {}\n # new keywords = {}".format(len(keywords), len(new_kws)))
        new_kws = set(good_samples + bad_samples).difference(keywords)
        print("# fresh keywords = {}\n".format(len(new_kws)))
        # make some call to naive_bayes.partial_fit in here
        X = np.concatenate((naive_bayes.feature_count_, np.zeros((naive_bayes.feature_count_.shape[0], len(new_kws)))), 1)
        all_kw = keywords + list(new_kws)
    else:
        print('Only using keywords from this content set')
        all_kw = list(set(good_samples + bad_samples))
        X = np.zeros((2, len(all_kw)))

    for j, kw in enumerate(all_kw):
        X[0, j] += good_samples.count(kw)
        X[1, j] += bad_samples.count(kw)

    y = ['good', 'bad']
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X, y)
    return naive_bayes, all_kw
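# The TODO in the function above mentions naive_bayes.partial_fit. A minimal sketch of how
# incremental updates work with scikit-learn's MultinomialNB.partial_fit is shown below; the
# arrays and class names are made up for illustration and are not part of the original code.
# Note that partial_fit requires the feature count to stay fixed across calls, so the
# keyword-growing scheme above cannot be plugged in directly.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
X_first = np.array([[2, 0, 1], [0, 3, 0]])
y_first = ['good', 'bad']
nb.partial_fit(X_first, y_first, classes=['good', 'bad'])  # classes are required on the first call

X_more = np.array([[1, 1, 0]])
nb.partial_fit(X_more, ['good'])  # later batches simply add to the learned counts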
def string_selection():
    # get data
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(train_data['string-data'], train_data['target'], test_size=0.2, random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)

    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # build the model
    model = MultinomialNB().fit(x_train, y_train)

    # validate the model
    predicted = model.predict(x_test)
    print(metrics.accuracy_score(y_test, predicted))
def run_naivebayes_evaluation(self, inputdata, outputdata, k):
    """
    Fit Naive Bayes classification on train set with cross validation.
    Run Naive Bayes classification on test set. Return results.
    """
    ### print("** Fitting Naive Bayes classifier..")

    # Cross validation
    cv = cross_validation.KFold(inputdata.shape[0], n_folds=k, indices=True)
    cv_naivebayes = []
    f1_scores = []
    for traincv, testcv in cv:
        clf_cv = MultinomialNB()
        clf_cv.fit(inputdata[traincv], outputdata[traincv])
        y_pred_cv = clf_cv.predict(inputdata[testcv])
        f1 = metrics.f1_score(outputdata[testcv], y_pred_cv, pos_label=0)
        f1_scores.append(f1)
        # TODO: NEEDED?
        self.classifier = clf_cv

    print("score average: %s" % str(np.mean(f1_scores)))
    average_score = np.mean(f1_scores)
    tuples = (average_score, f1_scores)
    return (tuples, 'N.A.', 'N.A.')
def predict(cur, plyr_id, game_plyrs):
    # creates training set (called 'X') for plyr
    all_plyrs = all_player_ids(cur)          # np.array - all NFL players (and coaches)
    games = games_played_in(cur, plyr_id)    # np.array - the game_ids the player played in
    n_cols = all_plyrs.shape[0]              # int
    m_rows = games.shape[0]                  # int
    w = weights(games)
    zeros = np.zeros((m_rows, n_cols))       # 2darr - used to initialize DF
    X = pd.DataFrame(zeros, index=games, columns=all_plyrs)  # dataframe
    populate_training_set(cur, X, games, plyr_id)
    # print("X: ", X.values)

    # creates vector of known output values
    Y = training_output_vector(cur, games, plyr_id)
    # print("(len) Y: ", len(Y), Y)

    test_zeros = np.zeros((1, n_cols))       # 2darr - used to initialize DF
    test_X = pd.DataFrame(test_zeros, columns=all_plyrs)  # dataframe
    update_training_matrix(cur, game_plyrs, 0, test_X)

    # run Multinomial NB classifier
    nb_clf = MultinomialNB()
    if len(X.values) == 0:
        return 0
    nb_clf.fit(X, Y, sample_weight=w)
    nb_predictions = nb_clf.predict(test_X)
    # print("test_X: ", test_X.values)
    nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
    avgs = [3, 8, 12.5, 17, 21, 25]
    # print("probs: ", nb_norm_prob)
    # print(avgs)
    ev = expected_val(nb_norm_prob, avgs)  # can also calc dot product
    return round(ev, 1)
class Sentiment:

    def __init__(self):
        self.stop_words = stopwords.words() + list(string.punctuation)
        self.tfid = TfidfVectorizer()
        self.clf = MultinomialNB()  # score: 0.7225
        # self.clf = SVC()
        # create pipelines
        # clean the input

    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        # give the subset of dataset to be trained
        l = 0
        h = 4000
        words = [word_tokenize(x.decode("utf-8").lower()) for x in X[l:h]]
        processed_words = [" ".join(w for w in s if w not in self.stop_words) for s in words]
        X_train = self.tfid.fit_transform(processed_words)
        Y_train = Y[l:h]
        self.clf.fit(X_train, Y_train)
        print("Classes: ", self.clf.classes_)
        print("Score: ", self.clf.score(X_train, Y_train))

    def predict(self, X_inp):
        word_list = " ".join(w for w in word_tokenize(X_inp.decode("utf-8").lower()) if w not in self.stop_words)
        X_test = self.tfid.transform([word_list])
        return self.clf.predict(X_test)
def run_learning_curves_experiment(dataset):
    logger.info("Now starting experiment with learning curves...")
    scores = []
    sklearn_scores = []
    train_sizes = []
    clf = MultinomialBayesEstimator()
    sklearn_clf = MultinomialNB()
    # Constructing confidence intervals using empiric bootstrap
    intervals = []
    for test_size in range(1, len(dataset)):
        f_scores = []
        f_scores_sklearn = []
        for train_set, test_set in split_train_test_p_out(dataset, test_size):
            train_set, test_set = split_train_test(dataset, test_size)
            X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)
            clf.fit(X_train, y_train)
            f_scores.append(f1_score(y_test, clf.predict(X_test)))
            sklearn_clf.fit(X_train, y_train.ravel())
            f_scores_sklearn.append(f1_score(y_test, sklearn_clf.predict(X_test)))
        intervals.append(calculate_confidence_interval(f_scores))
        scores.append(np.mean(f_scores))
        sklearn_scores.append(np.mean(f_scores_sklearn))
        train_sizes.append(len(dataset) - test_size)
    plot_learning_curves(train_sizes, sklearn_scores, scores, intervals)
def test_sklearn_nb(balanced):
    movie_words = process_plots_mp(balanced)
    training_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 != 0]
    test_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 == 0]

    vec = DictVectorizer()
    training_features = vec.fit_transform([movie.wordcounts for movie in training_movies]).toarray()
    training_labels = np.array([movie.year for movie in training_movies])
    # LOGGER.debug("Original size of feature vectors: %d (issparse: %s)" % (
    #     csr_matrix(training_features[-1]).toarray().size, str(issparse(training_features))
    # ))

    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(training_features, training_labels)

    test_features = vec.transform([movie.wordcounts for movie in test_movies])
    test_labels = np.array([movie.year for movie in test_movies])
    results = mnb_classifier.predict(test_features)

    correct = sum([1 for i, result in enumerate(results) if result == test_labels[i]])
    LOGGER.info("sklearn's MultinomialNB classifier predicted %d/%d correctly (%0.3f%% accuracy)" % (
        correct, len(test_labels), correct / len(test_labels) * 100
    ))
def train(self, data):
    launches = [x['application'] for x in data]

    # first-order predictor: previous launch -> next launch
    nb = MultinomialNB()
    instances = [{'lu1': launches[i - 1]} for i in range(1, len(launches))]
    X = self.vectorizer.fit_transform(instances).toarray()
    y = launches[1:]
    self.lu1_predictor = nb.fit(X, y)

    # second-order predictor: launch two steps back -> next launch
    # (a separate estimator, so the two predictors stay independent)
    nb = MultinomialNB()
    instances = [{'lu2': launches[i - 2]} for i in range(2, len(launches))]
    X = self.vectorizer.fit_transform(instances).toarray()
    y = launches[2:]
    self.lu2_predictor = nb.fit(X, y)

    # tune mu
    max_hr = 0
    best_mu = 0
    for mu in [x / 10.0 for x in range(11)]:
        self.mu = mu
        predictions = [self.predict({'lu1': launches[i - 1], 'lu2': launches[i - 2]}) for i in range(2, len(launches))]
        hr, mrr = self.test(launches[2:], predictions)
        if hr > max_hr:
            max_hr = hr
            best_mu = mu
    self.mu = best_mu
def run_k_fold_cross_validation_experiment(dataset):
    logger.info("Starting %d-fold cross-validation...", len(dataset))
    clf_sklearn = MultinomialNB()
    clf = MultinomialBayesEstimator()
    sklearn_scores = create_scores_collector()
    scores = create_scores_collector()
    for train_set, test_set in split_train_test_k_fold(dataset):
        X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)
        # Sklearn
        clf_sklearn.fit(X_train, y_train.ravel())
        predictions = clf_sklearn.predict(X_test)
        sklearn_scores.append_scores(y_test, predictions)
        # Our bayes without ngrams
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores.append_scores(y_test, predictions)
    logger.info("%d-fold cross validation finished", len(dataset))
    log_scores(sklearn_scores, "Sklearn")
    log_scores(scores, "MBE")
def MultinomialNBClassify(trainData, trainLabel, testData):
    # default alpha=1.0 is Laplace smoothing; setting alpha < 1 is called Lidstone smoothing
    nbClf = MultinomialNB(alpha=0.1)
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict(testData)
    saveResult(testLabel, 'sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
def main():
    print('Reading in data file...')
    data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv', usecols=['Sentiment', 'SentimentText'], error_bad_lines=False)

    print('Pre-processing tweet text...')
    corpus = data['SentimentText']
    vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode', stop_words='english', tokenizer=tokenize)
    X = vectorizer.fit_transform(corpus.values)
    y = data['Sentiment'].values

    print('Training sentiment classification model...')
    classifier = MultinomialNB()
    classifier.fit(X, y)

    print('Training word2vec model...')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4)
    word2vec.init_sims(replace=True)

    print('Fitting PCA transform...')
    word_vectors = [word2vec[word] for word in word2vec.vocab]
    pca = PCA(n_components=2)
    pca.fit(word_vectors)

    print('Saving artifacts to disk...')
    joblib.dump(vectorizer, path + 'vectorizer.pkl')
    joblib.dump(classifier, path + 'classifier.pkl')
    joblib.dump(pca, path + 'pca.pkl')
    word2vec.save(path + 'word2vec.pkl')

    print('Process complete.')
def naive_classify_unknown(X_train, y_train, vectorizer):
    client = pymongo.MongoClient("localhost", 27017)
    db = client.tweets
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    test_users = db.tweets.distinct('user.screen_name')
    classify_users(clf, vectorizer, test_users, load_users(db, test_users))
class NaiveBayes:

    def __init__(self):
        self.clf = MultinomialNB()
        self.pattern = '(?u)\\b[A-Za-z]{3,}'
        self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(2, 2))

    def train(self, fileName):
        print("Naive Bayes classifier is being trained")
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_train = self.tfidf.fit_transform(table.message)
        Y_train = [int(item) for item in table.cat]
        self.clf.fit(X_train, Y_train)
        print("Naive Bayes classifier has been trained")

    def classify(self, cFileName, rFileName):
        table = pandas.read_table(cFileName, names=["message"])
        X_test = self.tfidf.transform(table.message)
        print("Data have been classified")
        with open(rFileName, 'w') as f:
            for item in self.clf.predict(X_test).astype(str):
                f.write(item + '\n')

    def validate(self, fileName):
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_validate = self.tfidf.transform(table.message)
        Y_validated = self.clf.predict(X_validate).astype(str)
        totalNum = len(table.cat)
        errorCount = 0
        for i in range(0, totalNum):
            if int(table.cat[i]) != int(Y_validated[i]):
                errorCount += 1
        print("Data have been validated! Precision={}".format((totalNum - errorCount) / float(totalNum)))
def plain_word_counts(corpus_path):
    folds = KFold(article_count, n_folds=10, shuffle=True)
    results = []
    for i, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % i)
        vect = CountVectorizer(max_features=1000, decode_error='ignore', strip_accents='unicode')
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))
        bin = LabelEncoder()
        y_train = bin.fit_transform(GroupSequence(corpus_path, indices=train_idx))
        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = bin.transform(GroupSequence(corpus_path, indices=test_idx))
        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)
    return results
def bcluster(corpus_path, cluster_fn):
    folds = KFold(article_count, n_folds=10, shuffle=True)
    results = []
    for i, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % i)
        vect = BrownClusterVectorizer(cluster_fn)
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))
        bin = LabelEncoder()
        y_train = bin.fit_transform(GroupSequence(corpus_path, indices=train_idx))
        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = bin.transform(GroupSequence(corpus_path, indices=test_idx))
        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)
    return results
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method: ' + fs_method, 'fs num: ' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)

    return acc
def find_best_vectorizor(vectorizer, grid):
    dg = DataGatherer()
    y_test = dg.validate_target
    y_train = dg.labeled_target
    nb = MultinomialNB()
    header_printed = False
    best_params = None
    best_score = -1
    for param in IterGrid(grid):
        if not header_printed:
            print(str(",".join(param.keys())) + ",Score")
            header_printed = True
        vectorizer.set_params(**param)
        X_train = vectorizer.fit_transform(dg.labeled_data)
        X_test = vectorizer.transform(dg.validate_data)
        nb.fit(X_train, y_train)
        score = nb.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_params = param
        print(str(",".join(map(str, param.values()))) + "," + str(score))
    print("")
    print("Best params: " + str(best_params))
    print("Best score: " + str(best_score))
def train_chunk(X, Y, Xe, Ye):
    #clf = KNeighborsClassifier(n_neighbors=5).fit(X, Y)
    #clf = GaussianNB().fit(X, Y)
    clf = MultinomialNB().fit(X, Y)
    Yd = clf.predict(Xe)
    return stats(Ye, Yd)
print("DCDISTANCE + RF") t.classify(dataset=dataset, platform=platform, language=language, clf=RandomForestClassifier(random_state=42), parameters={'clf__n_estimators': tree_estimators}, feature_set=Model.DCDISTANCE_CODE, kfold=5) print("------------------------------------------------------") # BOW print("BOW + MultinomialNB") t.classify(dataset=dataset, platform=platform, language=language, clf=MultinomialNB(), parameters={}, feature_set=Model.BOW_CODE, kfold=5) print("------------------------------------------------------") print("BOW + KNN") t.classify(dataset=dataset, platform=platform, language=language, clf=KNeighborsClassifier(), parameters={'clf__n_neighbors': [1, 3, 5, 7]}, feature_set=Model.BOW_CODE, kfold=5) print("------------------------------------------------------")
def k_fold_cross_validation(x, y, splits, repeats):
    seed = 7

    # classifiers for the ensemble
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1')
    clf2 = MultinomialNB(alpha=1130)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=450)
    clf5 = ExtraTreesClassifier(random_state=seed, criterion='gini', n_estimators=1000, max_features=5)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2), ('NBG', clf3), ('KNN', clf4), ('ET', clf5), ('ADQ', clf6)], voting='hard')

    # algorithms being compared
    models = []
    models.append(('RL', LogisticRegression(random_state=seed, C=625, penalty='l1')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=450)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=1130)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=0.1)))
    models.append(('SVMR', SVC(random_state=seed, kernel='rbf', C=1, gamma=0.0001)))
    models.append(('RF', RandomForestClassifier(random_state=seed, criterion='entropy', n_estimators=1000, max_features=5)))
    models.append(('ET', ExtraTreesClassifier(random_state=seed, criterion='gini', n_estimators=1000, max_features=5)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm on accuracy
    score = 'accuracy'
    results1 = []
    names1 = []
    mean1 = []
    std1 = []
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats, random_state=seed)
        cv_results = model_selection.cross_val_score(model, x, y, cv=kfold, scoring=score)
        results1.append(cv_results)
        names1.append(name)
        mean1.append(cv_results.mean() * 100)
        std1.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100, cv_results.std() * 100)
        print(msg)

    list_results_acc = list(zip(names1, results1))
    print(list_results_acc)
    df_results_acc = pd.DataFrame(list_results_acc)
    if part_ign == 3:
        df_results_acc.to_csv('df_results_acc_3.csv', sep=';')
    if part_ign == 10:
        df_results_acc.to_csv('df_results_acc_10.csv', sep=';')
    if part_ign == 19:
        df_results_acc.to_csv('df_results_acc_19.csv', sep=';')

    if score == 'accuracy':
        list_acc = list(zip(names1, mean1, std1))
        df_acc = pd.DataFrame(list_acc)
        if part_ign == 3:
            df_acc.to_csv('df_acc_3.csv', sep=';')
        if part_ign == 10:
            df_acc.to_csv('df_acc_10.csv', sep=';')
        if part_ign == 19:
            df_acc.to_csv('df_acc_19.csv', sep=';')

    # classifiers for the ensemble (second configuration)
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1')
    clf2 = MultinomialNB(alpha=15)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=10)
    clf5 = ExtraTreesClassifier(random_state=seed, criterion='entropy', n_estimators=1000, max_features=17)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2), ('NBG', clf3), ('KNN', clf4), ('ET', clf5), ('ADQ', clf6)], voting='hard')

    models = []
    models.append(('RL', LogisticRegression(random_state=seed, C=625, penalty='l1')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=10)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=15)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=10)))
    models.append(('SVMR', SVC(random_state=seed, kernel='rbf', C=10, gamma=0.001)))
    models.append(('RF', RandomForestClassifier(random_state=seed, criterion='gini', n_estimators=1000, max_features=17)))
    models.append(('ET', ExtraTreesClassifier(random_state=seed, criterion='entropy', n_estimators=1000, max_features=17)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm on macro F1
    score = 'f1_macro'
    results2 = []
    names2 = []
    mean2 = []
    std2 = []
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats, random_state=seed)
        cv_results = model_selection.cross_val_score(model, x, y, cv=kfold, scoring=score)
        results2.append(cv_results)
        names2.append(name)
        mean2.append(cv_results.mean() * 100)
        std2.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100, cv_results.std() * 100)
        print(msg)

    list_results_f1 = list(zip(names2, results2))
    print(list_results_f1)
    df_results_f1 = pd.DataFrame(list_results_f1)
    if part_ign == 3:
        df_results_f1.to_csv('df_results_f1_3.csv', sep=';')
    if part_ign == 10:
        df_results_f1.to_csv('df_results_f1_10.csv', sep=';')
    if part_ign == 19:
        df_results_f1.to_csv('df_results_f1_19.csv', sep=';')

    if score == 'f1_macro':
        list_f1 = list(zip(names2, mean2, std2))
        df_f1 = pd.DataFrame(list_f1)
        if part_ign == 3:
            df_f1.to_csv('df_f1_3.csv', sep=';')
        if part_ign == 10:
            df_f1.to_csv('df_f1_10.csv', sep=';')
        if part_ign == 19:
            df_f1.to_csv('df_f1_19.csv', sep=';')

    # plot the results
    fig = plt.figure(figsize=(15, 5))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)

    plt.subplot(211)
    plt.boxplot(results1)
    ax1.set_xticklabels(names1, fontsize=14)
    plt.ylabel('Acurácia', fontsize=18)
    plt.xlabel('(a)', fontsize=18)
    plt.yticks(rotation='horizontal', fontsize=14)
    plt.axhline(y=0.4656, xmin=0, xmax=1, color='g')
    plt.axhline(y=0.5024, xmin=0, xmax=1, color='b')

    plt.subplot(212)
    plt.xlabel('(b)\nClassificadores', fontsize=18)
    plt.boxplot(results2)
    plt.ylabel('F1-score', fontsize=18)
    ax2.set_xticklabels(names2, fontsize=14)
    plt.yticks(rotation='horizontal', fontsize=14)
    ax2.annotate(
        'RL = Regressao Logistica\nADL = Analise Discr. Linear\n'
        'ADQ = Analise Discr. Quadratica\nKNN = K-Nearest Neighbors\n'
        'NBG = Naive Bayes Gaussiano\nNBM = Naive Bayes Multinomial\n'
        'SVML = SVM Linear\nSVMR = SVM kernel rbf\nRF = Random Forest\n'
        'ET = Extra Trees',
        # The point that we'll place the text in relation to
        xy=(1.01, 0.5),
        # Interpret the x as axes coords, and the y as figure coords
        xycoords=('axes fraction', 'figure fraction'),
        # The distance from the point that the text will be at
        xytext=(0, 0),
        # Interpret `xytext` as an offset in points...
        textcoords='offset points',
        # Any other text parameters we'd like
        size=12, ha='left', va='center')

    plt.subplot(212)
    plt.show()
from sklearn.datasets import fetch_20newsgroups  # import the 20-newsgroups fetcher from sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer  # import the text feature vectorization module
from sklearn.naive_bayes import MultinomialNB  # import the naive Bayes model from sklearn.naive_bayes
from sklearn.metrics import classification_report

# 1. Fetch the data
news = fetch_20newsgroups(subset='all')
print(len(news.data))  # prints the number of documents: 18846

# 2. Preprocessing: train/test split and text feature vectorization
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)  # randomly sample 25% of the data as the test set
print(X_train[0])      # inspect a training sample
print(y_train[0:100])  # inspect the labels

# text feature vectorization
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# 3. Train with naive Bayes
mnb = MultinomialNB()            # initialize naive Bayes with the default configuration
mnb.fit(X_train, y_train)        # estimate the model parameters from the training data
y_predict = mnb.predict(X_test)  # predict on the test set

# 4. Report the results
print('The Accuracy of Naive Bayes Classifier is:', mnb.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = [
    "K Nearest Neighbors", "Decision Tree", "Random Forest",
    "Logistic Regression", "SGD Classifier", "Naive Bayes", "SVM Linear"
]
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print("{} Accuracy: {}".format(name, accuracy))

# ### Building the VotingClassifier for Ensemble Modelling

# In[20]:
The binomial model is useful if your feature vectors are binary (i.e. zeros and ones).
One application would be text classification with a 'bag of words' model where the 1s and 0s are
"word occurs in the document" and "word does not occur in the document" respectively.

Refs:
http://cpmarkchang.logdown.com/posts/193470-natural-language-processing-naive-bayes-classifier
https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
x = iris.data
y = iris.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

gnb = GaussianNB()
mnb = MultinomialNB()

y_pred_gnb = gnb.fit(x_train, y_train).predict(x_test)
cnf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)
print(cnf_matrix_gnb)

y_pred_mnb = mnb.fit(x_train, y_train).predict(x_test)
cnf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
print(cnf_matrix_mnb)
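# The docstring above describes the binary-feature ("binomial") variant, but the snippet only
# exercises GaussianNB and MultinomialNB. A minimal sketch of that binary model, assuming
# scikit-learn's BernoulliNB and CountVectorizer(binary=True); the toy corpus and labels are
# made up for illustration only.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

docs = ["free prize money", "meeting schedule today", "win money now", "project status meeting"]
labels = [1, 0, 1, 0]  # hypothetical: 1 = spam, 0 = ham

binarizer = CountVectorizer(binary=True)  # 1 = word occurs in the document, 0 = it does not
X_bin = binarizer.fit_transform(docs)

bnb = BernoulliNB()
bnb.fit(X_bin, labels)
print(bnb.predict(binarizer.transform(["win a free prize"])))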
classifier_NB = GaussianNB()
classifier_NB.fit(X_train1, Y_train1)
pred_NB_train = classifier_NB.predict(X_train1)
np.mean(pred_NB_train == Y_train1)
pred_NB_test = classifier_NB.predict(X_test1)
np.mean(pred_NB_test == Y_test1)
# Train Accuracy NB = 85.74
# Test Accuracy NB = 68.91

classifier_MNB = MultinomialNB()
classifier_MNB.fit(X_train1, Y_train1)
pred_MNB_train = classifier_MNB.predict(X_train1)
np.mean(pred_MNB_train == Y_train1)
pred_MNB_test = classifier_MNB.predict(X_test1)
np.mean(pred_MNB_test == Y_test1)
# Train Accuracy MNB = 84.52
# Test Accuracy MNB = 83.99

classifier_DT = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier_DT.fit(X_train1, Y_train1)
pred_DT_train = classifier_DT.predict(X_train1)
np.mean(pred_DT_train == Y_train1)
pred_DT_test = classifier_DT.predict(X_test1)
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naive_bayes.picke", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Accuracy :", (nltk.classify.accuracy(classifier, testing_set)))

### Multinomial Naive Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier:", (nltk.classify.accuracy(MNB_classifier, testing_set)))

##### Gaussian Naive Bayes
##Gaussian_NB_classifier = SklearnClassifier(GaussianNB())
##Gaussian_NB_classifier.train(training_set)
##print("GNB_classifier:", (nltk.classify.accuracy(Gaussian_NB_classifier, testing_set)))

### Bernoulli Naive Bayes
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BNB_classifier:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)))

#LogisticRegression, SGDClassifier
def trainNaiveBayes(trainFeatures, trainLabels):
    clf = make_pipeline(DictVectorizer(sparse=False), MultinomialNB())
    scores = cross_val_score(clf, trainFeatures, trainLabels, cv=5)
    clf.fit(trainFeatures, trainLabels)
    return clf, scores.mean()
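# A minimal usage sketch for trainNaiveBayes above, assuming its imports (make_pipeline,
# DictVectorizer, cross_val_score, MultinomialNB) are in scope. The feature dicts and labels
# below are hypothetical and chosen so that 5-fold stratified CV is possible.
train_features = [{"word_count": n, "has_link": n % 2} for n in range(1, 11)]
train_labels = [n % 2 for n in range(1, 11)]  # hypothetical binary labels

clf, mean_cv_score = trainNaiveBayes(train_features, train_labels)
print("mean 5-fold CV accuracy:", mean_cv_score)
print(clf.predict([{"word_count": 5, "has_link": 1}]))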
else:
    dt_clf = joblib.load('DTmodel.pkl')

# test dt classifier
preds = dt_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))
#plot_roc_curve(dt_clf, X_test, Y_test)
plt.figure()
plot_confusion_matrix(cm, classes=['negative', 'positive'], normalize=True, title='Normalized confusion matrix - Decision Tree')
plt.show()

# train naive bayes classifier
nb_flag = 0  # if 1, train model from scratch and dump - if 0, load dumped model
nb = MultinomialNB()
if nb_flag:
    nb_clf = nb.fit(X_train, Y_train)
    joblib.dump(nb_clf, 'NBmodel.pkl')
else:
    nb_clf = joblib.load('NBmodel.pkl')

# test nb classifier
preds = nb_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))
#plot_roc_curve(nb_clf, X_test, Y_test)
plt.figure()
plot_confusion_matrix(cm, classes=['negative', 'positive'], normalize=True, title='Normalized confusion matrix - Naive Bayes')
plt.show()
plt.tight_layout(pad=0)
plt.show()

from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(emails["filtered_text"], emails["spam"], test_size=0.2, random_state=10)

# Bag of words with naive bayes
count_vectorizer = CountVectorizer()
count_vectorizer.fit(train_X)
X_train_df = count_vectorizer.transform(train_X)
X_test_df = count_vectorizer.transform(test_X)

classifier = MultinomialNB(alpha=1.8)
classifier.fit(X_train_df, train_y)
pred = classifier.predict(X_test_df)
accuracy_score(test_y, pred)

# TF-IDF with naive bayes
tf = TfidfVectorizer()
tf.fit(train_X)
tfidf_train_X = tf.transform(train_X)
tfidf_test_X = tf.transform(test_X)

classifier = MultinomialNB(alpha=0.04)
classifier.fit(tfidf_train_X, train_y)
pred = classifier.predict(tfidf_test_X)
accuracy_score(test_y, pred)
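# The two alpha values above (1.8 and 0.04) appear to be hand-tuned. A hedged sketch of picking
# alpha with a grid search over the same bag-of-words + MultinomialNB setup; it assumes the
# train_X / train_y split from the snippet above, and the alpha grid itself is arbitrary.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = make_pipeline(CountVectorizer(), MultinomialNB())
grid = GridSearchCV(pipe, {'multinomialnb__alpha': [0.01, 0.1, 0.5, 1.0, 2.0]}, cv=5)
grid.fit(train_X, train_y)
print(grid.best_params_, grid.best_score_)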
##'''10. Split the dataset into training data and testing data with train_test_split function
##Note: parameters test_size=0.33, random_state=42'''
#X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
#print(df.shape)
#print(X_train.shape)
#print(X_test.shape)

#11. Initialise multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

#12. Fit the training data with labels in Naive Bayes classifier 'clf'
"""
cv = CountVectorizer(stop_words='english')
training_data = cv.fit_transform(X_train)
testing_data = cv.transform(X_test)
clf.fit(training_data, y_train)
"""
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#print(accuracy_score(y_test, predictions))
#print(precision_score(y_test, predictions))
#loading the input csv data into the pandas dataframe
public_griv_df = pd.read_csv("pg_complete_set_1.csv", engine='python', error_bad_lines=False)
#print(public_griv_df.columns)
y = public_griv_df.org_name
X = public_griv_df.subject_content

#splitting the data into training and testing purposes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#pipeline for Naive Bayes algorithm
txt_clf_NB = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())])

#pipeline for SVM classifier algorithm
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42))])

#fitting and measuring accuracy for SVM model
text_clf_svm.fit(X_train, y_train)
#joblib.dump(text_clf_svm, 'model.pkl')
#text_clf_svm = joblib.load('model.pkl')
#print("Model dumped!")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from joblib import dump

df = pd.read_csv('FA-KES-Dataset.csv', encoding='latin1')
df.drop_duplicates(keep=False, inplace=True)
df['text'] = df['article_title'] + ' ' + df['article_content']

X = df["text"].values
y = df["labels"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=345)

nb = make_pipeline(CountVectorizer(binary=True), MultinomialNB())
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred))

nb.fit(X, y)
dump(nb, "clf.joblib")
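# A brief usage sketch for the persisted pipeline above, assuming the clf.joblib file written
# by that script is present; the sample headline is made up for illustration.
from joblib import load

clf = load("clf.joblib")
sample = ["Ceasefire announced in northern region after week of talks"]
print(clf.predict(sample))        # predicted label
print(clf.predict_proba(sample))  # class probabilities from MultinomialNB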
    # (inside a loop over the labelled tweets; the enclosing `for line in ...` is elided)
    tmp_score = line[0].strip('\"')
    if int(tmp_score) < 2:    # negative
        train_score.append(0)
        train_text.append(line[5])
    elif int(tmp_score) > 2:  # positive
        train_score.append(1)
        train_text.append(line[5])
    else:
        continue

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf.fit(train_text, train_score)

# Have to adjust for company name
base = len('Tesla')
filelist = os.listdir()
for item in filelist:
    if item[-4:] == '.txt':
        year = item[base + 1:base + 5]
        month = dict[item[base + 6:base + 9]]
        day = item[base + 10:-4]
        date = year + '-' + month + '-' + day
        output_dict[date] = []
        pred_file = open(item, "r", encoding="utf-8")
# split up the data
df_train, df_test, Ytrain, Ytest = train_test_split(df['data'], Y, test_size=0.33)

# try multiple ways of calculating features
tfidf = TfidfVectorizer(decode_error='ignore')
Xtrain = tfidf.fit_transform(df_train)
Xtest = tfidf.transform(df_test)

# count_vectorizer = CountVectorizer(decode_error='ignore')
# Xtrain = count_vectorizer.fit_transform(df_train)
# Xtest = count_vectorizer.transform(df_test)

# create the model, train it, print scores
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

# exit()

# visualize the data
def visualize(label):
    words = ''
    for msg in df[df['labels'] == label]['data']:
        msg = msg.lower()
        words += msg + ' '
    wordcloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(wordcloud)
from sklearn.model_selection import train_test_split as tts
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Code starts here
X_train, X_val, y_train, y_val = tts(X, y, test_size=0.3, random_state=42)

log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
log_accuracy = accuracy_score(y_pred, y_val)
print(log_accuracy)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_val)
nb_accuracy = accuracy_score(y_pred, y_val)
print(nb_accuracy)

lsvm = LinearSVC(random_state=0)
lsvm.fit(X_train, y_train)
y_pred = lsvm.predict(X_val)
lsvm_accuracy = accuracy_score(y_pred, y_val)
print(lsvm_accuracy)

# --------------
# path_test : Location of test data
pid_test = random.sample(list(pid), 10)
df_train = df[df['product_id'].isin(pid_train)]
df_test = df[df['product_id'].isin(pid_test)]
#print(df_train)

# Setting up Bag of Words Model
count_vect = CountVectorizer()
desc_train = df_train['desc']
X_train_counts = count_vect.fit_transform(list(desc_train))
print(X_train_counts.shape)
print(count_vect.vocabulary_.get('images'))

# Fitting tf-idf vectorization
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# applying Multinomial classifier on feature vectors obtained
clf = MultinomialNB().fit(X_train_tfidf, list(df_train['Category_ID']))

# testing the model; we only need transform here, since the global weights of each term
# were already learned from the training corpus
desc_test = df_test['desc']
pid_test = df_test['product_id']
X_test_counts = count_vect.transform(list(desc_test))
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tfidf)

for i, category in zip(pid_test, predicted):
    print("{} => {}".format(i, category))
def classifier_analysis(X, label, methodType):
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    #rng = None
    rng = np.random.RandomState(1)

    if methodType == 0:
        # random forest
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=n_jobs, random_state=rng, verbose=0, warm_start=False, class_weight=None)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__max_features': ['auto', 10, 5]
        }
    elif methodType == 1:
        # adaboost
        from sklearn.ensemble import AdaBoostClassifier
        classifier = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 2:
        # GBC
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=rng, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [50, 100, 150],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 3:
        # logistic regression
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=rng, solver='saga', max_iter=100, multi_class='multinomial', verbose=0, warm_start=False, n_jobs=n_jobs)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 4:
        # SVM
        from sklearn.svm import SVC
        classifier = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 5:
        # MLP
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__hidden_layer_sizes': [(100,), (50,), (20,)],
            'classifier__learning_rate_init': [0.0001, 0.001, 0.01]
        }
    elif methodType == 6:
        # linear SVM
        from sklearn.svm import LinearSVC
        classifier = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=rng, max_iter=1000)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 7:
        # Bernoulli Naive Bayes
        from sklearn.naive_bayes import BernoulliNB
        classifier = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    elif methodType == 8:
        # multinomial Naive Bayes
        from sklearn.naive_bayes import MultinomialNB
        classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
        param_grid = {
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    else:
        return

    if methodType == 8:
        pipe = Pipeline([
            ('classifier', classifier)
        ])
    else:
        pipe = Pipeline([
            ('scale', StandardScaler()),
            ('filter', FilterSimu()),
            ('classifier', classifier)
        ])

    grid = GridSearchCV(pipe, cv=ShuffleSplit(n_splits=4, test_size=0.25, random_state=rng), n_jobs=1, param_grid=param_grid)
    grid.fit(X, label)
    best_estimator = grid.best_estimator_
    #mean_scores = np.array(grid.cv_results_['mean_test_score'])
    #mean_tscores = np.array(grid.cv_results_['mean_train_score'])
    #print(mean_scores)
    #print(mean_tscores)
    print(grid.best_params_)
    score = grid.best_score_
    #print(grid.cv_results_['params'])
    return best_estimator, grid.predict(X), score
def NB_create_model():
    # collect post titles
    text_list = []
    for page_num in range(0, 50):  # number of pages can be changed
        url = 'http://guba.eastmoney.com/list,gssz,f_' + str(page_num) + '.html'
        stockPageRequest = requests.get(url, headers=headers)
        htmlTitleContent = stockPageRequest.text
        resp = Selector(text=htmlTitleContent)
        nodes = resp.xpath(
            '//div[contains(@class,"articleh normal_post") or contains(@class,"articleh normal_post odd")]'
        )
        # itemstemp = re.findall(pattern, content)
        for index, item in enumerate(nodes):
            view = item.xpath('./span[@class="l1 a1"]/text()').extract_first()
            comment_count = item.xpath('./span[@class="l2 a2"]/text()').extract_first()
            title = item.xpath('./span[@class="l3 a3"]/a/text()').extract_first()
            author = item.xpath('./span[@class="l4 a4"]/a/text()').extract_first()
            create_time = item.xpath('./span[@class="l5 a5"]/text()').extract_first()
            # parse the date
            date_pattern = re.search('(\d+)-(\d+)', create_time)
            month = sub_zero(date_pattern.group(1))
            day = sub_zero(date_pattern.group(2))
            seg_list = list(jieba.cut(title, cut_all=False))
            seg_str = " ".join(seg_list)
            text_list.append(seg_str)
    text_list = np.array(text_list)  # list of documents

    # label the documents
    class_vec = [' '] * len(text_list)  # a list of the same length
    for i in range(0, len(text_list)):
        for pos in positiveWord:
            if pos in text_list[i]:
                class_vec[i] = '积极'   # positive
        for neg in negativeWord:
            if neg in text_list[i]:
                class_vec[i] = '消极'   # negative
        for neu in neutralWord:
            if neu in text_list[i]:
                class_vec[i] = '中立'   # neutral
        if class_vec[i] == ' ':
            class_vec[i] = '无立场'     # no stance
        print(class_vec[i])

    # convert the documents into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    # this transformer computes the tf-idf weight of each word
    transformer = TfidfTransformer()
    # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(text_list))

    # build the classifier
    clf = MultinomialNB()
    clf.fit(tfidf, class_vec)

    # persist the artifacts
    joblib.dump(clf, 'Clf_v1.pkl')
    joblib.dump(vectorizer, 'Vect_v1')
    joblib.dump(transformer, 'Tf-Idf_v1')
#featuresets_bigrams = [
#    document_features_ngrams(nltk.FreqDist(d), bigrams_frq)
#    for d in movies_reviews["bigrams"]]
#featuresets_trigrams = [
#    document_features_ngrams(nltk.FreqDist(d), trigrams_frq)
#    for d in movies_reviews["trigrams"]]

elapsed_time = time.time() - start_time

#for i in range(100):
#    print(sum(x > 0 for x in featuresets_bigrams[i]))

bigrams_train, bigrams_test, biy_train, biy_test = train_test_split(featuresets_bigrams, Sentiments, test_size=0.1)

# Train a multinomial naive Bayes classifier
clfM = MultinomialNB()
clfM.fit(bigrams_train, biy_train)
print(elapsed_time)

# Test the classifier
predictions_train = clfM.predict(bigrams_train)
fails_train = sum(biy_train != predictions_train)
print("Misclassified points in the training set: {} of {} ({}%)\n"
      .format(fails_train, len(bigrams_train), 100 * fails_train / len(bigrams_train)))

predictions_test = clfM.predict(bigrams_test)
fails_test = sum(biy_test != predictions_test)
print("Misclassified points in the test set: {} of {} ({}%)\n"
      .format(fails_test, len(bigrams_test), 100 * fails_test / len(bigrams_test)))
spams = []
for s in range(len(class1['TEXT'])):
    spams.append('Спам')          # 'Spam'
class1['CLASS'] = spams

hams = []
for s in range(len(class2['TEXT'])):
    hams.append('Не спам')        # 'Not spam'
class2['CLASS'] = hams

class3 = pd.DataFrame(class2).append(class1)

count_vector = CountVectorizer()
result = count_vector.fit_transform(class3['TEXT'].values)

BinClass = MultinomialNB()
objects = class3['CLASS'].values
BinClass.fit(result, objects)

print('Введіть дані:')            # prompt: 'Enter the data:'
input_string = [input()]
count_input = count_vector.transform(input_string)
answers = BinClass.predict(count_input)
print("Введенні дані являються:", str(answers))   # 'The entered data is classified as:'

#Experiments:
#Words from the FirstClass
exp1 = ['Юридичний']
if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets

    print('Extracting features & training batches')
    clf = MultinomialNB()
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.partial_fit(training_set_X, training_set_y, classes=[0, 1, 2, 3, 4])
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
dados.append([1, 0, 1])
dados.append([1, 0, 1])
dados.append([1, 0, 0])
dados.append([1, 1, 0])
dados.append([1, 1, 1])
dados.append([1, 1, 0])
dados.append([0, 1, 0])
dados.append([0, 1, 1])
dados.append([1, 1, 1])
dados.append([1, 1, 0])
dados.append([0, 1, 0])
dados.append([0, 1, 1])

marcacoes = ([1] * 10) + ([0] * 6)

modelo = MultinomialNB()
modelo.fit(dados, marcacoes)

_1cervejeiro = [1, 1, 1]
_2cervejeiro = [1, 0, 0]
_1leiteiro = [0, 1, 1]
_2leiteiro = [0, 1, 0]

dados_teste = [_1cervejeiro, _2cervejeiro, _1leiteiro, _2leiteiro]
marcacoes_teste = [1, 1, 0, 0]

resultado = modelo.predict(dados_teste)
diferencas = resultado - marcacoes_teste
acertos = [d for d in diferencas if d == 0]
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Importing the cleaned file containing the text and label
news = pd.read_csv('news.csv')
X = news['text']
y = news['label']

# Splitting the data into train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Creating a pipeline that first creates bag of words (after applying stopwords) & then applies Multinomial Naive Bayes model
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('nbmodel', MultinomialNB())])

# Training our data
pipeline.fit(X_train, y_train)

# Predicting the label for the test data
pred = pipeline.predict(X_test)

# Checking the performance of our model
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

# Serialising the file
with open('model.pickle', 'wb') as handle:
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)
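# A short usage sketch for the serialised pipeline above, assuming the model.pickle file it
# writes is present; the sample headline is made up for illustration.
import pickle

with open('model.pickle', 'rb') as handle:
    model = pickle.load(handle)

print(model.predict(["Scientists discover a new species of deep-sea fish"]))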
samples_weight = compute_sample_weight('balanced', train_df['toxicity'])
print(train_df.head(5))

count_vect = CountVectorizer(stop_words='english')
tfidf_transformer = TfidfTransformer()

train_df['comment_text'] = train_df['comment_text'].astype('U')
test_df['comment_text'] = test_df['comment_text'].astype('U')

X_train_counts = count_vect.fit_transform(train_df['comment_text'].values)
print(X_train_counts.shape)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, train_df['toxicity'], sample_weight=samples_weight)

X_new_counts = count_vect.transform(test_df['comment_text'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

print(Counter(train_df['toxicity']))
print(accuracy_score(predicted, test_df['toxicity']))
print(f1_score(predicted, test_df['toxicity']))
print(predicted.shape)
print(classification_report(predicted, test_df['toxicity']))

# For a threshold of 0.5:
# (1799564, 318216)
train_tc = count_vectorizer.fit_transform(training_data.data)
print('\nDimensions of training data:', train_tc.shape)

# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Define test data
input_data = [
    'You need to be careful with cars when you are driving on slippery roads',
    'A lot of devices can be operated wirelessly',
    'Players need to be careful when they are close to goal posts',
    'Political debates help us understand the perspectives of both sides'
]

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

# Transform input data using count vectorizer
input_tc = count_vectorizer.transform(input_data)

# Transform vectorized data using tfidf transformer
input_tfidf = tfidf.transform(input_tc)

# Predict the output categories
predictions = classifier.predict(input_tfidf)

# Print the outputs
for sent, category in zip(input_data, predictions):
    print('\nInput:', sent, '\nPredicted category:', category_map[training_data.target_names[category]])
# count_f = CountVectorizer(max_features=1000)
# x_train_bow_f = count_f.fit_transform(X_train)
#
# count_test_f = CountVectorizer(max_features=1000)
# x_test_bow_f = count_f.transform(X_test)

# all words considered
count = CountVectorizer(lowercase=False, token_pattern='[A-Za-z0-9#@_$%]{2,}')
x_train_bow = count.fit_transform(X_train)
# count_test = CountVectorizer()
test_bow = count.transform(X_test)

# # model takes the most frequent 1000 words
# clf = MultinomialNB()
# train_model_f = clf.fit(x_train_bow_f, y_train)
# predict_and_test(train_model_f, x_test_bow_f)

# model takes all words considered
clf = MultinomialNB(alpha=1)
model = clf.fit(x_train_bow, y_train)
predicted_y = model.predict(test_bow)

f = open("output.txt", 'a')
for i in range(0, len(test_id)):
    f.write(str(test_id[i]))
    f.write(' ')
    f.write(predicted_y[i])
    f.write('\n')
f.close()

# predict_and_test(train_model, x_test_bow)
print(len(content_train))

# To extract useful features from the noise-reduced data, we build bag-of-words features from the text
vectorizer = CountVectorizer(
    analyzer='word',      # tokenize by word
    ngram_range=(1, 4),   # use ngrams of size 1 to 4
    max_features=20000)   # keep the 20000 most common ngrams
vectorizer.fit(content_train)


def get_features(content):
    return vectorizer.transform(content)


# import classifier and train data
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(content_train), tag_train)
classifier.score(vectorizer.transform(content_test), tag_test)

"""
cross validation part
"""


# A more reliable method of cross validation is StratifiedKFold,
# which keeps the sample categories in each fold relatively balanced
def stratified_k_fold(content, tag, classifier_class, shuffle=True, n_splits=5, **kwargs):
    sk_fold = StratifiedKFold(n_splits=n_splits, shuffle=shuffle)
    tag_prediction = tag[:]
def multinomial_nb_cl(params):
    cl = MultinomialNB(**params)
    return cl
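# A hedged usage sketch for the factory above; the parameter values are illustrative only.
params = {'alpha': 0.5, 'fit_prior': True}
cl = multinomial_nb_cl(params)  # equivalent to MultinomialNB(alpha=0.5, fit_prior=True)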
def makeClassifierBayes(tfidf, result, alpha=1.0):
    clf = MultinomialNB(alpha=alpha).fit(tfidf, result)
    return clf
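# A minimal usage sketch for makeClassifierBayes, assuming `tfidf` is a document-term matrix and
# `result` the label vector; the toy corpus and labels below are made up for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["cheap pills online", "team meeting at noon", "cheap online offer", "lunch with the team"]
labels = ["spam", "ham", "spam", "ham"]  # made-up labels

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

clf = makeClassifierBayes(tfidf_matrix, labels, alpha=0.5)
print(clf.predict(vectorizer.transform(["cheap team offer"])))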