class Sentiment:
    """Bag-of-words sentiment classifier: TF-IDF features + multinomial naive Bayes."""

    def __init__(self):
        # Stop words plus punctuation are stripped before vectorizing.
        self.stop_words = stopwords.words() + list(string.punctuation)
        self.tfid = TfidfVectorizer()
        self.clf = MultinomialNB()  # score: 0.7225
        # self.clf = SVC()

    @staticmethod
    def _ascii_fold(text):
        """Normalize *text* to plain ASCII, dropping unmappable characters.

        Accepts both str and bytes (the original called str.decode, which
        no longer exists on Python 3 str).
        """
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    def fit(self, X, Y):
        """Fit the vectorizer and classifier on the first 4000 samples of X/Y."""
        self.X = X
        self.Y = Y
        n = 4000
        print("Fitting ", n, " samples...")
        words = [word_tokenize(self._ascii_fold(x)) for x in X[:n]]
        processed_words = [" ".join(w for w in s if w not in self.stop_words)
                           for s in words]
        X_train = self.tfid.fit_transform(processed_words)
        Y_train = Y[:n]
        print("Classifier created")
        self.clf.fit(X_train, Y_train)

    def predict(self, X_inp):
        """Return the predicted label array for one raw input document."""
        X_inp = self._ascii_fold(X_inp)
        word_list = " ".join(w for w in word_tokenize(X_inp.lower())
                             if w not in self.stop_words)
        X_test = self.tfid.transform([word_list])
        return self.clf.predict(X_test)
def main():
    """Load pickled features/labels, train a MultinomialNB, and report performance.

    Inputs:
        file: binary file containing sparse numpy array with text features
        file: binary file containing pandas dataframe with training labels
    Outs:
        print: classification report of classifier performance
    """
    # Load training labels and text features from the pickle directory.
    chdir("../pickles")
    with open("word_counts.pkl", "rb") as feature_file:
        features = pickle.load(feature_file)
    with open("training_labels.pkl", "rb") as label_file:
        label_frame = pickle.load(label_file)
    labels = np.ravel(label_frame["sponsored"])

    # Hold out a third of the data for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42)

    # Create, train, and evaluate the model.
    model = MultinomialNB()
    model.fit(X_train, y_train)
    print(classification_report(y_test, model.predict(X_test)))
class NBTest(unittest.TestCase):
    """Compare our NaiveBayes variants against sklearn's reference implementations."""

    def setUp(self):
        self.mnb = NaiveBayes(multinomial=True)
        self.skmnb = MultinomialNB()
        self.bnb = NaiveBayes(bernoulli=True)
        self.skbnb = BernoulliNB()
        self.cnb = NaiveBayes(multinomial=True, cnb=True)
        self.wcnb = NaiveBayes(multinomial=True, wcnb=True)

    def test_count_vectorized(self):
        """Our multinomial NB must match sklearn exactly on count features."""
        self.mnb.fit(X_count, train_targets)
        self.skmnb.fit(X_count, train_targets)
        self.assertEqual(self.mnb.score(X_count_test, test_targets),
                         self.skmnb.score(X_count_test, test_targets))

    def test_tfidf_vectorized(self):
        """Our multinomial NB must match sklearn exactly on tf-idf features."""
        self.mnb.fit(X_tfidf, train_targets)
        self.skmnb.fit(X_tfidf, train_targets)
        self.assertEqual(self.mnb.score(X_tfidf_test, test_targets),
                         self.skmnb.score(X_tfidf_test, test_targets))

    def test_cnb(self):
        """Complement NB should score within 0.1 of plain multinomial NB."""
        self.cnb.fit(X_count, train_targets)
        self.mnb.fit(X_count, train_targets)
        cnb_score = self.cnb.score(X_count_test, test_targets)
        mnb_score = self.mnb.score(X_count_test, test_targets)
        print("CNB: {}, MNB: {}".format(cnb_score, mnb_score))
        # assertGreater (not a bare assert): still checked under python -O.
        self.assertGreater(cnb_score - mnb_score, -0.1)

    def test_wcnb(self):
        """Weighted complement NB should score within 0.5 of multinomial NB."""
        self.wcnb.fit(X_count, train_targets)
        self.mnb.fit(X_count, train_targets)
        wcnb_score = self.wcnb.score(X_count_test, test_targets)
        mnb_score = self.mnb.score(X_count_test, test_targets)
        print("WCNB: {}, MNB: {}".format(wcnb_score, mnb_score))
        self.assertGreater(wcnb_score - mnb_score, -0.5)
def classifier():
    """Train NB on the precomputed matrices, then classify each site row from the DB."""
    nb = MultinomialNB(alpha=0)
    nb.fit(DOC_TRAIN, CLASS_TRAIN)
    db = DB()
    query = 'select cate_id, tf, url, content from site_content_3'
    cursor = db.cursor()
    logger.info(query)
    cursor.execute(query)
    rows = cursor.fetchall()
    for row in rows:
        currentCateId = row['cate_id']
        print('rowID => ', row['cate_id'])
        url = row['url']
        tf = row['tf']
        content = row['content']
        termFrequencyDict = {}
        try:
            termFrequencyDict = json.loads(tf)
        except ValueError:  # malformed JSON in the tf column; skip the row
            print('error => ', url)
            continue
        # Build the test vector. Bug fix: np.append() was called without the
        # array to extend, which raises TypeError; it must be given testItem
        # and the result reassigned.
        testItem = np.array([])
        for word in termFrequencyDict:
            freq = termFrequencyDict[word]
            # dict.has_key() was removed in Python 3; use the `in` operator.
            if word in WORDS:
                testItem = np.append(testItem, [freq])
            else:
                testItem = np.append(testItem, [0])
        print("CURRENT CATE ", currentCateId)
        print("NEW ", nb.predict(testItem))
def crossValidate(X_dataset, y):
    """5-fold stratified cross-validation of MultinomialNB; prints average metrics.

    X_dataset: sparse feature matrix; y: label vector.
    """
    num_folds = 5
    kfold = cross_validation.StratifiedKFold(y, n_folds=num_folds, shuffle=True)
    # kfold = KFold(X.shape[0], n_folds=10, shuffle=True)
    avg_accuracy = 0
    avg_precision = 0
    avg_recall = 0
    print("----------- cross_validation k=5")
    for train, test in kfold:
        Xtrain, Xtest, ytrain, ytest = (X_dataset[train], X_dataset[test],
                                        y[train], y[test])
        # clf = LinearSVC()
        clf = MultinomialNB(alpha=0.1)
        # clf = LDA()
        clf.fit(Xtrain.toarray(), ytrain)
        ypred = clf.predict(Xtest.toarray())
        avg_accuracy += metrics.accuracy_score(ytest, ypred)
        avg_precision += metrics.precision_score(ytest, ypred)
        avg_recall += metrics.recall_score(ytest, ypred)
    print("Average accuracy : ", avg_accuracy / num_folds)
    print("Average precision : ", avg_precision / num_folds)
    print("Average recall : ", avg_recall / num_folds)
def naive_bayes():
    """Fit a MultinomialNB on the module-level split; return (precision, recall, score)."""
    model = MultinomialNB()
    model.fit(X_train, train_data.danger)
    predictions = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    precision, recall, _, _ = precision_recall_fscore_support(y_test, predictions)
    return precision, recall, str(accuracy)
def MultinomialNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    """Fit a multinomial NB and save/return P(class == 1) for each test row.

    Fix: the output file name advertises alpha=0.1 (and the sibling
    MultinomialNBClassify uses 0.1), but the classifier was left at the
    default alpha=1.0; align the smoothing with the recorded artifact name.
    """
    # alpha < 1.0 is Lidstone smoothing; the sklearn default 1.0 is Laplace.
    nbClf = MultinomialNB(alpha=0.1)
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict_proba(testData)[:, 1]
    saveResult(enrollment_id, testLabel,
               'Proba_sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
class TrainNaiveBayes:
    """Train a multinomial naive-Bayes model over per-user feature vectors."""

    def __init__(self, all_features, neu_labels):
        """Trains a classifier using Naive Bayes.

        all_features: mapping user_id -> feature vector.
        neu_labels:   mapping user_id -> label (only these users are trained on).
        """
        # next(iter(...)) works on Python 3 dict views; .values()[0] does not.
        self._num_features = len(next(iter(all_features.values())))
        # Build the matrix in one shot instead of repeated np.append (which
        # copies the whole array each iteration) plus the dummy-row deletion.
        rows = [all_features[user_id] for user_id in neu_labels]
        labels = [neu_labels[user_id] for user_id in neu_labels]
        self._X = numpy.array(rows)   # Feature matrix
        self._Y = numpy.array(labels) # Label vector
        print("Using MultinomialNB")
        self._model = MultinomialNB()
        print(cross_validation.cross_val_score(self._model, self._X, self._Y,
                                               cv=10, scoring='f1'))
        self._model.fit(self._X, self._Y)

    def predict(self, features):
        """Return predictions for the feature vectors keyed by user_id."""
        A = numpy.array([features[user_id] for user_id in features])
        return self._model.predict(A)
def train(good_sources, bad_sources, method, naive_bayes=None, keywords=list()):
    """Train (or extend) a good/bad MultinomialNB over keyword counts.

    good_sources / bad_sources: iterables of dicts; entry[method] is the text field.
    naive_bayes: optional previously trained model whose feature counts are reused.
    keywords: keyword list matching the columns of the previous model.
    Returns (fitted MultinomialNB, keyword list defining its columns).

    NOTE(review): keywords=list() is a shared mutable default — safe only if
    callers never mutate it; consider None plus a guard.
    """
    # Train the algorithm: extract candidate keywords from each corpus.
    good_samples = find_keywords(' '.join([entry[method] for entry in good_sources]))
    bad_samples = find_keywords(' '.join([entry[method] for entry in bad_sources]))
    # If we have an existing knowledge base to append this new information to, do so.
    if naive_bayes:
        new_kws = set(good_samples+bad_samples)
        print('Using old keywords as well')
        print("# old keywords = {}\n # new keywords = {}".format(len(keywords), len(new_kws)))
        # Keep only keywords not already in the old model's vocabulary.
        new_kws = set(good_samples+bad_samples).difference(keywords)
        print("# fresh keywords = {}\n".format(len(new_kws)))
        # TODO: make some call to naive_bayes.partial_fit in here.
        # Carry the old per-class counts over, widening the matrix with zero
        # columns for the newly discovered keywords.
        X = np.concatenate((naive_bayes.feature_count_,
                            np.zeros((naive_bayes.feature_count_.shape[0], len(new_kws)))), 1)
        all_kw = keywords + list(new_kws)
    else:
        print('Only using keywords from this content set')
        all_kw = list(set(good_samples+bad_samples))
        X = np.zeros((2, len(all_kw)))
    # Row 0 accumulates 'good' keyword counts; row 1 accumulates 'bad' counts.
    for j, kw in enumerate(all_kw):
        X[0, j] += good_samples.count(kw)
        X[1, j] += bad_samples.count(kw)
    y = ['good', 'bad']
    # Refit a fresh model on the accumulated count matrix.
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X, y)
    return naive_bayes, all_kw
class Sentiment:
    """TF-IDF + multinomial naive-Bayes sentiment classifier (training-score variant)."""

    def __init__(self):
        # Stop words plus punctuation are stripped before vectorizing.
        self.stop_words = stopwords.words() + list(string.punctuation)
        self.tfid = TfidfVectorizer()
        self.clf = MultinomialNB()  # score: 0.7225
        # self.clf = SVC()

    @staticmethod
    def _to_text(x):
        """Decode bytes to str; pass str through (str.decode is gone in Python 3)."""
        return x.decode("utf-8") if isinstance(x, bytes) else x

    def fit(self, X, Y):
        """Train on samples l..h of X/Y and print classes and training score."""
        self.X = X
        self.Y = Y
        # Subset of the dataset used for training.
        l = 0
        h = 4000
        words = [word_tokenize(self._to_text(x).lower()) for x in X[l:h]]
        processed_words = [" ".join(w for w in s if w not in self.stop_words)
                           for s in words]
        X_train = self.tfid.fit_transform(processed_words)
        Y_train = Y[l:h]
        self.clf.fit(X_train, Y_train)
        print("Classes: ", self.clf.classes_)
        print("Score: ", self.clf.score(X_train, Y_train))

    def predict(self, X_inp):
        """Predict the label for one raw document."""
        word_list = " ".join(w for w in word_tokenize(self._to_text(X_inp).lower())
                             if w not in self.stop_words)
        X_test = self.tfid.transform([word_list])
        return self.clf.predict(X_test)
def run_naivebayes_evaluation(self, inputdata, outputdata, k):
    """
    Fit Naive Bayes Classification on train set with cross validation.
    Run Naive Bayes Classification on test set.
    Return ((average_f1, per_fold_f1s), 'N.A.', 'N.A.').
    """
    # k-fold cross-validation over the rows of inputdata.
    cv = cross_validation.KFold(inputdata.shape[0], n_folds=k, indices=True)
    f1_scores = []
    for traincv, testcv in cv:
        clf_cv = MultinomialNB()
        clf_cv.fit(inputdata[traincv], outputdata[traincv])
        y_pred_cv = clf_cv.predict(inputdata[testcv])
        f1_scores.append(metrics.f1_score(outputdata[testcv], y_pred_cv, pos_label=0))
        # TODO: NEEDED? keeps only the classifier from the last fold.
        self.classifier = clf_cv
    average_score = np.mean(f1_scores)
    # Bug fix: the original did  "score average: %s" + str(...)  which printed
    # the literal placeholder followed by the number; use % formatting.
    print("score average: %s" % average_score)
    tuples = (average_score, f1_scores)
    return (tuples, 'N.A.', 'N.A.')
def test_sklearn_nb(balanced):
    """Train sklearn's MultinomialNB on 2/3 of the movies; log accuracy on the rest."""
    movie_words = process_plots_mp(balanced)
    # Every third movie goes to the test set; the rest train.
    training_movies = [m for i, m in enumerate(movie_words) if i % 3 != 0]
    test_movies = [m for i, m in enumerate(movie_words) if i % 3 == 0]
    vec = DictVectorizer()
    training_features = vec.fit_transform(
        [movie.wordcounts for movie in training_movies]).toarray()
    training_labels = np.array([movie.year for movie in training_movies])
    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(training_features, training_labels)
    test_features = vec.transform([movie.wordcounts for movie in test_movies])
    test_labels = np.array([movie.year for movie in test_movies])
    results = mnb_classifier.predict(test_features)
    correct = sum(1 for i, result in enumerate(results) if result == test_labels[i])
    # float() guards against silent integer division if this runs on Python 2;
    # also fixes the "skleanrn" typo in the log message.
    LOGGER.info("sklearn's MultinomialNB classifier predicted %d/%d correctly (%0.3f%% accuracy)" % (
        correct, len(test_labels), float(correct) / len(test_labels) * 100
    ))
def predict(cur, plyr_id, game_plyrs):
    """Predict expected fantasy points for plyr_id given teammates in game_plyrs."""
    # Training matrix X: one row per game the player appeared in,
    # one column per NFL player/coach.
    all_plyrs = all_player_ids(cur)          # np.array - all NFL players (and coaches)
    games = games_played_in(cur, plyr_id)    # np.array - game ids the player played in
    n_cols = all_plyrs.shape[0]
    m_rows = games.shape[0]
    w = weights(games)
    X = pd.DataFrame(np.zeros((m_rows, n_cols)), index=games, columns=all_plyrs)
    populate_training_set(cur, X, games, plyr_id)
    # Known output values for each training game.
    Y = training_output_vector(cur, games, plyr_id)
    # Single-row test matrix. Bug fix: the original built test_X from the
    # m_rows-tall `zeros` array instead of the 1-row `test_zeros`, producing
    # an m_rows x n_cols "test" frame.
    test_X = pd.DataFrame(np.zeros((1, n_cols)), columns=all_plyrs)
    update_training_matrix(cur, game_plyrs, 0, test_X)
    # Multinomial NB classifier over the teammate-presence features.
    nb_clf = MultinomialNB()
    if len(X.values) == 0:  # no training games -> no basis for a prediction
        return 0
    nb_clf.fit(X, Y, sample_weight=w)
    nb_predictions = nb_clf.predict(test_X)
    nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
    avgs = [3, 8, 12.5, 17, 21, 25]  # representative score for each predicted bucket
    ev = expected_val(nb_norm_prob, avgs)  # dot product of probabilities and bucket means
    return round(ev, 1)
def run_k_fold_cross_validation_experiment(dataset):
    """Score sklearn's MultinomialNB against MultinomialBayesEstimator with k-fold CV."""
    logger.info("Starting %d-fold cross-validation...", len(dataset))
    sklearn_model = MultinomialNB()
    our_model = MultinomialBayesEstimator()
    sklearn_scores = create_scores_collector()
    scores = create_scores_collector()
    for train_set, test_set in split_train_test_k_fold(dataset):
        X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)
        # Reference implementation from sklearn.
        sklearn_model.fit(X_train, y_train.ravel())
        sklearn_scores.append_scores(y_test, sklearn_model.predict(X_test))
        # Our bayes without ngrams.
        our_model.fit(X_train, y_train)
        scores.append_scores(y_test, our_model.predict(X_test))
    logger.info("%d-fold cross validation finished", len(dataset))
    log_scores(sklearn_scores, "Sklearn")
    log_scores(scores, "MBE")
def run_learning_curves_experiment(dataset):
    """Plot learning curves (with bootstrap confidence intervals) for both models."""
    logger.info("Now starting experiment with learning curves...")
    scores = []
    sklearn_scores = []
    train_sizes = []
    clf = MultinomialBayesEstimator()
    sklearn_clf = MultinomialNB()
    # Constructing confidence intervals using empiric bootstrap.
    intervals = []
    for test_size in range(1, len(dataset)):  # xrange is Python 2 only
        f_scores = []
        f_scores_sklearn = []
        for train_set, test_set in split_train_test_p_out(dataset, test_size):
            # Bug fix: the original immediately overwrote this split with
            # split_train_test(dataset, test_size), discarding every
            # leave-p-out partition the loop was iterating over.
            X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)
            clf.fit(X_train, y_train)
            f_scores.append(f1_score(y_test, clf.predict(X_test)))
            sklearn_clf.fit(X_train, y_train.ravel())
            f_scores_sklearn.append(f1_score(y_test, sklearn_clf.predict(X_test)))
        intervals.append(calculate_confidence_interval(f_scores))
        scores.append(np.mean(f_scores))
        sklearn_scores.append(np.mean(f_scores_sklearn))
        train_sizes.append(len(dataset) - test_size)
    plot_learning_curves(train_sizes, sklearn_scores, scores, intervals)
def MultinomialNBClassify(trainData, trainLabel, testData):
    """Fit MultinomialNB (alpha=0.1) and save/return the predicted test labels."""
    # alpha < 1.0 is Lidstone smoothing; the sklearn default 1.0 is Laplace.
    model = MultinomialNB(alpha=0.1)
    model.fit(trainData, ravel(trainLabel))
    predicted = model.predict(testData)
    saveResult(predicted, 'sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return predicted
def train(self, data):
    """Train the lu1/lu2 launch predictors and tune the mixing weight mu.

    data: sequence of dicts with an 'application' key (launch history).
    """
    launches = [record['application'] for record in data]
    # Predict launch i from the immediately preceding launch (lu1).
    # Bug fix: the original reused ONE MultinomialNB instance for both
    # predictors, so the second fit() silently overwrote the first and
    # lu1_predictor/lu2_predictor were the same refit object.
    instances = [{'lu1': launches[i - 1]} for i in range(1, len(launches))]
    X = self.vectorizer.fit_transform(instances).toarray()
    self.lu1_predictor = MultinomialNB().fit(X, launches[1:])
    # Predict launch i from the launch two steps back (lu2).
    # NOTE(review): refitting self.vectorizer here changes the feature space
    # the lu1 predictor was trained against — confirm intended.
    instances = [{'lu2': launches[i - 2]} for i in range(2, len(launches))]
    X = self.vectorizer.fit_transform(instances).toarray()
    self.lu2_predictor = MultinomialNB().fit(X, launches[2:])
    # Tune mu by hit rate on the training sequence.
    max_hr = 0
    best_mu = 0
    for mu in [x / 10.0 for x in range(11)]:
        self.mu = mu
        predictions = [self.predict({'lu1': launches[i - 1], 'lu2': launches[i - 2]})
                       for i in range(2, len(launches))]
        hr, mrr = self.test(launches[2:], predictions)
        if hr > max_hr:
            max_hr = hr
            best_mu = mu
    self.mu = best_mu
class NaiveBayes:
    """Bigram TF-IDF + multinomial naive-Bayes text classifier."""

    def __init__(self):
        self.clf = MultinomialNB()
        # Only tokens of 3+ letters.
        self.pattern = '(?u)\\b[A-Za-z]{3,}'
        self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True,
                                     smooth_idf=True, stop_words='english',
                                     token_pattern=self.pattern,
                                     ngram_range=(2, 2))

    def train(self, fileName):
        """Fit the vectorizer and classifier from a tab-separated cat/message file."""
        print("Naive Bayes classifier is being trained")
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_train = self.tfidf.fit_transform(table.message)
        Y_train = [int(item) for item in table.cat]
        # Bug fix: fit() was called twice in a row; one call suffices.
        self.clf.fit(X_train, Y_train)
        print("Naive Bayes classifier has been trained")

    def classify(self, cFileName, rFileName):
        """Classify messages in cFileName, writing one label per line to rFileName."""
        table = pandas.read_table(cFileName, names=["message"])
        X_test = self.tfidf.transform(table.message)
        print("Data have been classified")
        with open(rFileName, 'w') as f:
            for item in self.clf.predict(X_test).astype(str):
                f.write(item + '\n')

    def validate(self, fileName):
        """Report accuracy of the trained classifier on a labelled file."""
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_validate = self.tfidf.transform(table.message)
        Y_validated = self.clf.predict(X_validate).astype(str)
        totalNum = len(table.cat)
        errorCount = 0
        for i in range(0, totalNum):
            if int(table.cat[i]) != int(Y_validated[i]):
                errorCount += 1
        print("Data have been validated! Precision={}".format(
            (totalNum - errorCount) / float(totalNum)))
def main():
    """Train and persist the sentiment classifier, word2vec model, and PCA transform."""
    print('Reading in data file...')
    dataset = pd.read_csv(path + 'Sentiment Analysis Dataset.csv',
                          usecols=['Sentiment', 'SentimentText'],
                          error_bad_lines=False)

    print('Pre-processing tweet text...')
    corpus = dataset['SentimentText']
    vectorizer = TfidfVectorizer(decode_error='replace',
                                 strip_accents='unicode',
                                 stop_words='english',
                                 tokenizer=tokenize)
    features = vectorizer.fit_transform(corpus.values)
    labels = dataset['Sentiment'].values

    print('Training sentiment classification model...')
    classifier = MultinomialNB()
    classifier.fit(features, labels)

    print('Training word2vec model...')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4,
                        min_count=10, workers=4)
    word2vec.init_sims(replace=True)

    print('Fitting PCA transform...')
    word_vectors = [word2vec[word] for word in word2vec.vocab]
    pca = PCA(n_components=2)
    pca.fit(word_vectors)

    print('Saving artifacts to disk...')
    for artifact, fname in ((vectorizer, 'vectorizer.pkl'),
                            (classifier, 'classifier.pkl'),
                            (pca, 'pca.pkl')):
        joblib.dump(artifact, path + fname)
    word2vec.save(path + 'word2vec.pkl')

    print('Process complete.')
def bcluster(corpus_path, cluster_fn):
    """Return per-fold accuracies of MultinomialNB over Brown-cluster features (10-fold CV)."""
    folds = KFold(article_count, n_folds=10, shuffle=True)
    results = []
    for i, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % i)
        vect = BrownClusterVectorizer(cluster_fn)
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))
        # Renamed from `bin`, which shadowed the builtin of the same name.
        label_enc = LabelEncoder()
        y_train = label_enc.fit_transform(GroupSequence(corpus_path, indices=train_idx))
        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = label_enc.transform(GroupSequence(corpus_path, indices=test_idx))
        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)
    return results
def naive_classify_unknown(X_train, y_train, vectorizer):
    """Fit MultinomialNB on the labelled data and classify every known tweet author."""
    mongo = pymongo.MongoClient("localhost", 27017)
    tweet_db = mongo.tweets
    model = MultinomialNB()
    model.fit(X_train, y_train)
    screen_names = tweet_db.tweets.distinct('user.screen_name')
    classify_users(model, vectorizer, screen_names,
                   load_users(tweet_db, screen_names))
def find_best_vectorizor(vectorizer, grid):
    """Grid-search vectorizer parameters by MultinomialNB validation score.

    Prints one CSV row per parameter combination, then the best combination.
    """
    dg = DataGatherer()
    y_test = dg.validate_target
    y_train = dg.labeled_target
    nb = MultinomialNB()
    header_printed = False
    best_params, best_score = None, -1
    for param in IterGrid(grid):
        if not header_printed:
            print(",".join(param.keys()) + ",Score")
            header_printed = True
        vectorizer.set_params(**param)
        X_train = vectorizer.fit_transform(dg.labeled_data)
        X_test = vectorizer.transform(dg.validate_data)
        nb.fit(X_train, y_train)
        score = nb.score(X_test, y_test)
        if score > best_score:
            best_score, best_params = score, param
        print(",".join(map(str, param.values())) + "," + str(score))
    print("")
    print("Best params: " + str(best_params))
    print("Best score: " + str(best_score))
def train(self):
    '''Train the comment-topic NB model and persist it with joblib.

    ## -- How to predict -- ##
    query = "blah blah"
    q = list2Vec(hashit(query))
    clf2 = joblib.load('nb')
    print(clf2.predict(q))  # <--- returns type id
    '''
    limit = self.comment_limit
    # Most recent comments for each of the three entity types.
    sqls = ["SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=1 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=2 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=3 ORDER BY time DESC LIMIT " + str(limit)]
    print("training model")
    comments = self.sql2list(sqls)
    x, y = self.featureMatrix(comments)
    X = list2Vec(x)
    Y = list2Vec(y)
    # (Removed the unused q/q_vec demo vector; see docstring for usage.)
    print("Classifying")
    clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    clf.fit(X, Y)
    joblib.dump(clf, self.path, compress=9)
def train_classifiers(X_data, y_data):
    """Fit seven classifiers on the same data and return them in a fixed order.

    The accuracy noted beside each model comes from earlier experiment runs.
    """
    linear_svm = svm.SVC(kernel='linear')                 # Linear SVM: 0.908
    linear_svm.fit(X_data, y_data)

    mnb = MultinomialNB()                                 # MultinomialNB: 0.875
    mnb.fit(X_data, y_data)

    forest = RandomForestClassifier(n_estimators=200,     # Random Forest: 0.910
                                    criterion='entropy')
    forest.fit(X_data, y_data)

    extra_trees = ExtraTreesClassifier(n_estimators=500,  # Extra Tree: 0.915
                                       max_depth=None,
                                       min_samples_split=1,
                                       random_state=0)
    extra_trees.fit(X_data, y_data)

    ada = AdaBoostClassifier()                            # AdaBoost: 0.88
    ada.fit(X_data, y_data)

    rbf_svm = svm.SVC(C=200, gamma=0.06, kernel='rbf')    # rbf SVM: 0.895
    rbf_svm.fit(X_data, y_data)

    gbc = GradientBoostingClassifier()                    # GradientBoosting: 0.88
    gbc.fit(X_data, y_data)

    return linear_svm, mnb, forest, extra_trees, ada, rbf_svm, gbc
def naive_bayes(x_value, y_value):
    """Train/test MultinomialNB on a count-vectorized split and print metrics."""
    X = x_value
    y = y_value
    # Train/test split (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_test_dtm = vect.transform(X_test)
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy: ')
    print(metrics.accuracy_score(y_test, y_pred_class))
    # Null accuracy: frequency of the majority class.
    print('Null Accuracy: ')
    print(y_test.value_counts().head(1) / len(y_test))
    print('Confusion Matrix: ')
    print(metrics.confusion_matrix(y_test, y_pred_class))
def nb(x_train, x_test, y_train, doc_app_id, id_name_dict):
    """Fit MultinomialNB and print each app's name next to its predicted label."""
    clf = MultinomialNB(alpha=0.01)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    for i in range(len(pred)):
        app_id = doc_app_id[i]
        print(id_name_dict[app_id] + " " + str(pred[i]))
def trainNB(xTrain, yTrain):
    """Return a MultinomialNB classifier fitted on the given training data."""
    return MultinomialNB().fit(xTrain, yTrain)
def multinomialNB(devMatrix, trainMatrix, devtarget, traintarget):
    """Fit MultinomialNB, log train/dev MSE and scores to MNNB2.log, return dev score."""
    # `with` guarantees the log file is closed even if scoring raises.
    with open('MNNB2.log', 'a') as f:
        f.write("Making model!!!!!")
        print('Making model!')
        clf = MultinomialNB(alpha=1, fit_prior=False)
        clf.fit(trainMatrix, traintarget)
        f.write("\n")
        value = ('Model: multinomial bayes with parameters ', clf.get_params(False))
        print(str(value))
        f.write(str(value))
        f.write("\n")
        f.write("MSE for train: %.2f" % np.mean((clf.predict(trainMatrix) - traintarget) ** 2))
        score = clf.score(trainMatrix, traintarget)
        f.write("\n")
        # NOTE: as in the original, the train-score tuple is built but never
        # written to the log; behavior preserved.
        value = ('Score for train %.2f', score)
        f.write("\n")
        f.write("MSE for dev: %.2f" % np.mean((clf.predict(devMatrix) - devtarget) ** 2))
        score = clf.score(devMatrix, devtarget)
        value = ('Score for dev %.2f', score)
        print(str(value))
        f.write("\n")
        f.write(str(value))
        f.write("\n")
        f.write('model done')
        f.write("\n")
        f.write("\n")
    return score
def do_lda(x, y, folds):
    """k-fold CV of MultinomialNB on shuffled data; return per-fold correct counts."""
    indexes = list(range(len(x)))
    shuffle(indexes)
    x = [x[i] for i in indexes]
    y = [y[i] for i in indexes]
    # Bug fix: use floor division — on Python 3 plain `/` yields floats,
    # breaking fold_index comparisons below.
    fold_size = len(x) // folds
    corrects = []
    for fold in range(folds):
        test_x, train_x, test_y, train_y = [], [], [], []
        for i in range(len(x)):
            fold_index = i // fold_size
            if fold == fold_index:
                test_x.append(x[i])
                test_y.append(y[i])
            else:
                train_x.append(x[i])
                train_y.append(y[i])
        print('Partitioned data into fold')
        test_x, train_x = remove_redundant_dimensions(test_x, train_x)
        print('Removed redundant dimensions')
        nb = MultinomialNB()
        nb.fit(train_x, train_y)
        print('Fit NB')
        predictions = nb.predict(test_x)
        correct = sum(1 for i in range(len(predictions))
                      if predictions[i] == test_y[i])
        print('Did fold, correct:', correct)
        corrects.append(correct)
    return corrects
def plain_word_counts(corpus_path):
    """Return per-fold accuracies of MultinomialNB over raw word counts (10-fold CV)."""
    folds = KFold(article_count, n_folds=10, shuffle=True)
    results = []
    for i, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % i)
        vect = CountVectorizer(max_features=1000, decode_error='ignore',
                               strip_accents='unicode')
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))
        # Renamed from `bin`, which shadowed the builtin of the same name.
        label_enc = LabelEncoder()
        y_train = label_enc.fit_transform(GroupSequence(corpus_path, indices=train_idx))
        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = label_enc.transform(GroupSequence(corpus_path, indices=test_idx))
        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)
    return results
def word_classification():
    """Return 5-fold cross-validated scores of MultinomialNB on the word features."""
    features, labels = get_features_and_labels()
    clf = MultinomialNB()
    clf.fit(features, labels)
    folds = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    return cross_val_score(clf, features, labels, cv=folds)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Load the voice dataset, keeping three acoustic features plus the label.
voice_data = pd.read_csv('voice.csv')
voice_data = voice_data[['meanfun', 'IQR', 'Q25', 'label']]
x = voice_data.iloc[:, :-1]
y = voice_data.iloc[:, -1]
y = LabelEncoder().fit_transform(y)  # encode the string labels as integers

# Treat zeros as missing values; replace them with the column mean.
imp = SimpleImputer(missing_values=0, strategy='mean')
x = imp.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
predictionrate = []  # collected test accuracies, one entry per model

# Multinomial naive Bayes.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_predict = mnb.predict(x_test)
print('MultinomialNB准确率:', mnb.score(x_test, y_test))
print(classification_report(y_test, y_predict))
predictionrate.append(mnb.score(x_test, y_test))

# Gaussian naive Bayes on the same split.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_predict = gnb.predict(x_test)
print('GaussianNB准确率:', gnb.score(x_test, y_test))
print(classification_report(y_test, y_predict))
predictionrate.append(gnb.score(x_test, y_test))

# Standardize features for the models that follow.
# NOTE(review): only x_train is transformed in this span; confirm that
# x_test is transformed before any later use.
scaler1 = StandardScaler()
scaler1.fit(x_train)
x_train = scaler1.transform(x_train)
word = vectorizer.get_feature_names()
for n in word[:10]:
    print(n)
print("单词数量:", len(word))

# Extract the tf-idf matrix: element w[i][j] is the tf-idf weight of
# term j in document i.
X = coo_matrix(tfidf, dtype=np.float32).toarray()  # densify the sparse matrix
print(X.shape)
print(X[:10])

# Split the dense matrix back into train/test halves by label count.
X_train = X[:len(train_labels)]
X_test = X[len(train_labels):]
y_train = train_labels
y_test = test_labels
print(len(X_train), len(X_test), len(y_train), len(y_test))

#-----------------------------------------------------------------------------
# Classification model (alternatives kept for experimentation).
clf = MultinomialNB()
#clf = svm.LinearSVC()
#clf = LogisticRegression(solver='liblinear')
#clf = RandomForestClassifier(n_estimators=10)
#clf = neighbors.KNeighborsClassifier(n_neighbors=7)
#clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
print('模型的准确度:{}'.format(clf.score(X_test, y_test)))
pre = clf.predict(X_test)
print("分类")
print(len(pre), len(y_test))
print(classification_report(y_test, pre, digits=4))
# 50/50 train/test split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

# Decision-tree classifier.
clf = tree.DecisionTreeClassifier()

# Naive Bayes classifier.
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()

# One-hot encode the categorical attributes (columns 0, 2, 3, 4, 5).
enc = OneHotEncoder(categorical_features=[0, 2, 3, 4, 5])
enc.fit(x)  # fit the encoder to the full data
clf.fit(enc.transform(x_train), y_train)
nb_clf.fit(enc.transform(x_train), y_train)  # Naive Bayes - multinomial model

# Predictions on the encoded test split.
predictions = clf.predict(enc.transform(x_test))
prediction_NB = nb_clf.predict(enc.transform(x_test))

# Accuracy (print() replaces the Python 2 print statements).
from sklearn.metrics import accuracy_score
print('Accuracy tree encoded data prediction', accuracy_score(y_test, predictions))
print('Accuracy Multinomial NB data prediction', accuracy_score(y_test, prediction_NB))

# Learning-curve plot imports.
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import numpy as np
############################################################ with open("E:/AB104/AlgorithmTest/Jieba_Booking.json", 'r') as a: data = json.load(a) data = DataFrame(data) classifier = MultinomialNB() X_train, X_test, y_train, y_test = train_test_split(data['comments'].values, data['mark'].values, test_size=0) targets = y_train # print len(targets) #241221 count_vectorizer = CountVectorizer() counts = count_vectorizer.fit_transform(X_train) # print len(X_train) #241221 classifier.fit(counts, targets) ############################################################ ## 進行檢測之結果儲存 ## ############################################################ commList_Jieba_marked = [] for i in commList_Jieba: commList_Jieba_marked_dict = {} examples = [i["comments"]] # print i["comments"] example_counts = count_vectorizer.transform(examples) predictions = classifier.predict(example_counts) commList_Jieba_marked_dict["mark"] = predictions.tolist() # print predictions commList_Jieba_marked_dict["comments"] = [i["comments"]] commList_Jieba_marked_dict["hotel"] = [i["hotel"]]
    15: "world"
}

# Load the Nepali news training and test sets.
trainNews = pd.read_csv("./data/train.csv")
testNews = pd.read_csv("./data/test.csv")
xTrain = trainNews['text']
yTrain = trainNews['label']

# Fit the vectorizer, then build the train/test matrices.
# NOTE(review): fit() followed by fit_transform() re-fits the same
# vectorizer; the fit_transform call alone would suffice.
tfidf = vect.fit(xTrain.values.astype('U'))
xTrainvect = vect.fit_transform(xTrain)
yTrainvect = yTrain
xTestvect = vect.transform(testNews['text'])
yTestvect = testNews['label']

# Train and evaluate the classifier.
model = MultinomialNB(alpha=0.01, fit_prior=True)
model.fit(xTrainvect, yTrainvect)
ypred = model.predict(xTestvect)
score = accuracy_score(yTestvect, ypred)
print("Accuracy: ", score)

# Persist the model and vectorizer for later use.
pickle.dump(
    model,
    open(
        "/Nepali-NLP/Nepali-News-Classification/models/news_classifier_model.pickle",
        'wb'))
pickle.dump(
    tfidf,
    open(
        "/Nepali-NLP/Nepali-News-Classification/models/news_vectorizer.pickle",
        "wb"))

####TEST#####
from texts import text_counter, text_training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

intercepted_text = "I love my China."
text_counts = text_counter.transform([intercepted_text])

# First 1000 training rows carry label 0 (negative), the next 1000 label 1 (positive).
text_classifier = MultinomialNB()
text_labels = [0] * 1000 + [1] * 1000
text_classifier.fit(text_training, text_labels)

# One predict_proba call; column 0 is the negative probability, column 1 positive.
probabilities = text_classifier.predict_proba(text_counts)[0]
final_neg = probabilities[0]
final_pos = probabilities[1]

if final_pos > final_neg:
    print("The text is positive.")
else:
    print("The text is negative.")
# Build a vocabulary from the training set.
vectorizer.fit(train_M)

# A look inside:
# for x, y in zip(vectorizer.get_feature_names(), vectorizer.idf_):
#     print(x, y, sep=' : ')

# Transform the training set into a bag of words.
bow = vectorizer.transform(train_M)

# Instantiate and train the naive-Bayes classifier.
algo = MultinomialNB()
algo.fit(bow, train_L)
print('I am trained to predict')

# Prediction time: classify every test message and count misclassifications.
tot = len(test_M)
err = 0
for lbl, msg in zip(test_L, test_M):
    prediction = algo.predict(vectorizer.transform([msg]))
    print(prediction[0], lbl, sep=' : ')
    if prediction[0] != lbl:
        err += 1
print('Failure Rate :', err, '/', tot)
# Joining the stemmed words dialog = ' '.join(words) # Creating a corpus corpus.append(dialog) # Creating the Bag of Words model from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer(max_features=10000, ngram_range=(1, 2)) X = cv.fit_transform(corpus).toarray() y = df['genre'].values # Creating a pickle file for the CountVectorizer pickle.dump(cv, open('cv-transform.pkl', 'wb')) # Model Building from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0) # Fitting Naive Bayes to the Training set from sklearn.naive_bayes import MultinomialNB nb_classifier = MultinomialNB(alpha=0.1) nb_classifier.fit(X_train, y_train) # Creating a pickle file for the Multinomial Naive Bayes model filename = 'movie-genre-mnb-model.pkl' pickle.dump(nb_classifier, open(filename, 'wb'))
# NOTE(review): this while-loop appears to be the tail of the
# readTrainingData definition from earlier in the file; `a` (a row iterator)
# and `dataLabel` come from that enclosing scope.
count = 0
# Drain the iterator, labelling every row with dataLabel.
while True:
    count += 1
    try:
        ele = next(a)
        TrainingData.append(list(ele))
        TrainingResult.append(dataLabel)
        # print(type(TrainingData))
        # print(type(TrainingResult))
    except StopIteration:
        print('Training ' + dataLabel + ' :' + str(count))
        break

readTrainingData('C:/Research_PatternRecognition/Data/AllC16Data.csv', '16 Cell')
readTrainingData('C:/Research_PatternRecognition/Data/AllC8Data.csv', '8 Cell')
readTrainingData('C:/Research_PatternRecognition/Data/AllC4Data.csv', '4 Cell')

# Fit Gaussian and multinomial NB models on the same training data.
# NOTE(review): np.float is removed in NumPy 1.24+; plain float works here.
model = GaussianNB()
modelMultiNorm = MultinomialNB()
model.fit(np.array(TrainingData).astype(np.float), np.array(TrainingResult))
modelMultiNorm.fit(
    np.array(TrainingData).astype(np.float), np.array(TrainingResult))

# Compare both models' predictions against the known result.
print(model.predict(np.array(TestSample).astype(np.float)))
#print(model.predict_proba(np.array(TestSample).astype(np.float)))
print(TestActualResult)
print(modelMultiNorm.predict(np.array(TestSample).astype(np.float)))
print(TestActualResult)
X_vec = cv.fit_transform(X_train_clean).toarray() # eğitim verisi print(X_vec) X_test_vect = cv.transform(X_test_clean).toarray() print("Data vektörize tamam \n --Sonuçlar gösterilecek.") print("Makine öğrenme algoritması çalıştırılıyor.") from sklearn.naive_bayes import MultinomialNB from sklearn import model_selection, svm from sklearn.metrics import accuracy_score, classification_report, confusion_matrix mn = MultinomialNB() mn.fit(X_vec, Y_train) # X_vec = Metinler, / Y_train = 0,1 lerden oluşan liste print(X_vec.shape) print(X_test_vect.shape) Y_test_pred = mn.predict(X_test_vect) print(Y_test_pred) print("Naive bayes accuracy score : ", accuracy_score(Y_test_pred, Y_test) * 100) print(classification_report(Y_test, Y_test_pred)) cnf_matrix = confusion_matrix(Y_test, Y_test_pred) labels = [0, 1]
# NOTE(review): these first statements reference `index`, `negative_set`, and
# `ran_doc_index`, which appear to belong to a per-class sampling loop from
# earlier in the file.
training_label_set[index + 1] = np.vstack(
    (training_label_set[index + 1], negative_set))
new_lb_train = np.delete(new_lb_train, ran_doc_index, axis=0)
training_label_set[index + 1] = training_label_set[index + 1][:, max_index]
# For the vector dataset.
training_data_set[index + 1] = np.vstack(
    (training_data_set[index + 1], getRowsFromMatrix(ran_doc_index, new_vec_lb_train)))
new_vec_lb_train = np.delete(new_vec_lb_train, ran_doc_index, axis=0)

# Create binary classifiers, one per class (classes are 1-indexed).
binary_classifiers = []
for index in range(10):
    nb = MultinomialNB(alpha=0.01)
    nb.fit(sparse.csr_matrix(training_data_set[index + 1]),
           training_label_set[index + 1])
    binary_classifiers.append(nb)

# Label each test document with every binary classifier.
test_binary_label = []
for row in vectorised_test_documents:
    generated_label = []
    for classifier in binary_classifiers:
        # NOTE(review): this calls `nb` (the last classifier fitted above)
        # instead of the loop variable `classifier`, so every column repeats
        # the tenth classifier's prediction — almost certainly a bug; confirm
        # and switch to `classifier.predict(row)`.
        generated_label.append((nb.predict(row))[0])
    test_binary_label.append(generated_label)
test_binary_label = np.array(test_binary_label)

# Remove all other classes from the ground-truth label matrix.
all_class_index = [item for item in range(0, test_labels.shape[1])]
col_to_delete = [x for x in all_class_index if x not in index_max_class]
test_labels = np.delete(test_labels, col_to_delete, axis=1)
# print("test_binary_label",test_binary_label[:3,:])
def classify(self, document):
    """Fit a fresh Multinomial Naive Bayes model on the training vectors
    and return its prediction(s) for *document*.

    `self.vectorize` is expected to yield the training vectors at index 0
    and the vectors for *document* at index 1.
    """
    vectors = self.vectorize(document)
    model = MultinomialNB()
    model.fit(vectors[0], self.train_labels)
    return model.predict(vectors[1])
]] #select features scaler = StandardScaler() scaler.fit(data) #prepare training and test data msk = np.random.rand(len(df)) < 0.7 train = data[msk] test = data[~msk] xtrain = np.array(train.iloc[:, 0:num_features], dtype=np.float32) ytrain = np.array(train.iloc[:, num_features:(num_features + num_classes + 1)], dtype=np.float32) xtest = np.array(test.iloc[:, 0:num_features], dtype=np.float32) ytest = np.array(test.iloc[:, num_features:(num_features + num_classes + 1)], dtype=np.float32) #Model clf = MultinomialNB() clf.fit(xtrain, ytrain) MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) # In[16]: #Validation prediction = clf.predict(xtest) correct = 0 for i in range(len(prediction)): if prediction[i] == ytest[i]: correct += 1 print(float(correct) / len(prediction))
xtrain, xtest, ytrain, ytest = train_test_split(df["text"], df["label_n"], test_size=0.20) # Feature extraction text --CountVectorizer from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer() xtrain_count = cv.fit_transform(xtrain.values) xtrain_count.toarray()[:3] # Create a model from sklearn.naive_bayes import MultinomialNB model = MultinomialNB() model.fit(xtrain_count, ytrain) emails = [ 'Hey mohan, can we get together to watch footbal game tomorrow?', 'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!' ] emails_count = cv.transform(emails) model.predict(emails_count) # Check accuracy xtest_count = cv.transform(xtest) model.score(xtest_count, ytest) # Sklearn pipeline from sklearn.pipeline import Pipeline
x.append(tweets_data[i]['text'])  # (continues the tweet-collection loop begun above)
y.append(sent['sentiment'][i])
#print(x[0].split(" "))
#print(y[0])

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

# Bag-of-words features with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform(x)
# NOTE(review): `[:-500]` keeps everything EXCEPT the last 500 items, so
# `actual` and `test_features` below are drawn from the data the model is
# trained on, not a held-out tail.  A `[-500:]` holdout was probably
# intended -- confirm with the author before changing.
actual = y[:-500]

nb = MultinomialNB()
nb.fit(train_features, [int(r) for r in y])  # labels coerced to int

test_features = vectorizer.transform(x[:-500])
# Two ad-hoc probe sentences.
test_try = vectorizer.transform([
    "Can we all stop treating anxiety like it's a choice and something cool to have thank you"
])
test_try2 = vectorizer.transform(["I feel like drinking alchohol"])
predict2 = nb.predict(test_try)
predict3 = nb.predict(test_try2)
#print(predict2)
predictions = nb.predict(test_features)
print()
# 75/25 split of the 20newsgroups corpus (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(news.data,
                                                    news.target,
                                                    test_size=0.25,
                                                    random_state=33)

# Bag-of-words counts: fit the vocabulary on train only, reuse it for test.
from sklearn.feature_extraction.text import CountVectorizer

counnt_vec = CountVectorizer()
x_count_train = counnt_vec.fit_transform(x_train)
x_count_test = counnt_vec.transform(x_test)

from sklearn.naive_bayes import MultinomialNB

mnb_count = MultinomialNB()
mnb_count.fit(x_count_train, y_train)
print("the accuracy of classifying 20newsgroups using Naive Bayes:",
      mnb_count.score(x_count_test, y_test))
y_count_predict = mnb_count.predict(x_count_test)
from sklearn.metrics import classification_report
print(
    classification_report(y_test,
                          y_count_predict,
                          target_names=news.target_names))

# Same experiment with TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
x_tfidf_train = tfidf_vec.fit_transform(x_train)
import numpy as np
import pandas as pd

# Load the raw risk data.  (BUG FIX: the original read this CSV into `data`
# twice in a row; one read suffices.)
data = pd.read_csv('riskcsv.csv', index_col=0)
# sample = data.sample(frac=1)
# sample.reset_index(drop=False)  # recreate the index
# sample.reset_index(drop=True)
# save the sampled data to 'risk_sample.csv'
# sample.to_csv('risk_sample.csv')

# Numeric matrix view of the same file (header row skipped).
my_matrix = np.loadtxt(open("riskcsv.csv", "rb"), delimiter=",", skiprows=1)
# print(my_matrix)
# print(str(my_matrix))

# Rows 1..14 as training features/labels, row 0 as the query sample.
# (x1 is required by clf.fit below, so its definition must be live.)
x1 = my_matrix[1:15, 0:21]
print(x1)
y = my_matrix[1:15, -1]
print(y)
x2 = my_matrix[0:1, 0:21]
print(x2)

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(x1, y)
print(clf.predict(my_matrix[0:1, 0:21]))
# Document-term matrix diagnostics (notebook-style bare expressions).
rf_dtm.shape  # (402, 1337), 1 record error titles
type(rf_dtm)  # scipy.sparse.csr.csr_matrix
print(rf_dtm)  # sparse storage: only non-zero locations/values are kept
rf_Ddtm = rf_dtm.toarray()  # dense copy (D = dense)
rf_Ddtm.shape  # (402, 1337)
type(rf_Ddtm)  # numpy.ndarray
print(rf_Ddtm)
df_dtm = pd.DataFrame(rf_Ddtm, columns=dims)
type(df_dtm)  # pandas.core.frame.DataFrame
df_dtm.shape  # (402, 1337)
print(df_dtm)

mnb = MultinomialNB()  # knn not working; NB suits this wide (b < l) matrix
y_cl = mnb.fit(rf_Ddtm, cls)

#######################################################
# Score the held-out report set.
test_rf = pd.read_csv('Test RR.csv', names=['RepID', 'RepCols'], skiprows=[0],
                      index_col=0)
test_rf.head(10)
# BUG FIX: the original read this column from the TRAINING frame `rf`
# (test_rf_X = rf['RepCols']), so the model was re-scored on its own
# training text instead of the test file just loaded.
test_rf_X = test_rf['RepCols']
test_rfdtm = vect.transform(test_rf_X).toarray()
test_predict_y_cl = mnb.predict(test_rfdtm)
test_predict_y_cl.shape
test_predict_y_cl.__len__()
test_rf['pred_cl'] = test_predict_y_cl  # 41 wrong predictions out of 402
# Visualizations of excel download
def embedded_model(column, k):
    """Exhaustive marker-subset search with a Multinomial Naive Bayes model.

    For every subset size 1..k, evaluates each combination of the given
    feature columns by 5-fold cross-validation accuracy on the training
    split, keeps the best subset per size, and writes the results to
    'Extremophile_classifier.xlsx'.

    Parameters
    ----------
    column : list of feature-column names from the global `encoded_dataset`.
    k : largest subset size to try.
    """
    X = encoded_dataset[column]
    y = encoded_dataset['Label']
    # Split kept (same seed) even though only the training side is scored,
    # so results stay identical to the original implementation.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=13)

    selected_markers = []
    mnb_accuracy = []
    # CHANGED: `for i in range(k): i = i + 1` replaced by a plain range;
    # also removed a dead `predictions = model.predict(testX)` whose result
    # was never used (evaluation relies on cross_val_score only).
    for size in range(1, k + 1):
        markers = list(combinations(column, size))
        # marker -> mean CV accuracy for this subset size
        model_list = {}
        for marker in markers:
            selected = list(marker)
            trainX = X_train[selected]
            # Build the Naive Bayes model (the original comment said "svm",
            # but the estimator is MultinomialNB).
            model = MultinomialNB()
            model.fit(trainX, y_train)
            # 5-fold CV accuracy on the training split.
            scores = cross_val_score(model, trainX, y_train, cv=5)
            model_list[marker] = scores.mean()
        # Keep the best-scoring marker combination for this subset size.
        optimum = max(list(model_list.values()))
        mark = list(model_list.keys())[list(
            model_list.values()).index(optimum)]
        selected_markers.append(mark)
        mnb_accuracy.append(round(optimum, 2))

    # Final output: one row per subset size.
    df1 = pd.DataFrame(list(zip(selected_markers, mnb_accuracy)),
                       columns=['Markers', 'Accuracy'])
    df1.to_excel('Extremophile_classifier.xlsx')
# Check whether the DataFrames are equal print(count_df.equals(tfidf_df)) #---------------------------------------------------------------------------------------------------------------# #Training and testing the "fake news" model with CountVectorizer # Import the necessary modules from sklearn.naive_bayes import MultinomialNB from sklearn import metrics # Instantiate a Multinomial Naive Bayes classifier: nb_classifier nb_classifier = MultinomialNB() # Fit the classifier to the training data nb_classifier.fit(count_train, y_train) # Create the predicted tags: pred pred = nb_classifier.predict(count_test) # Calculate the accuracy score: score score = metrics.accuracy_score(y_test, pred) print(score) # Calculate the confusion matrix: cm cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']) print(cm) #---------------------------------------------------------------------------------------------------------------# #Training and testing the "fake news" model with TfidfVectorizer
import numpy as np

# Minimal MultinomialNB smoke test on random count data:
# six samples, 100 count features, six distinct classes.
X = np.random.randint(5, size=(6, 100))
print(X)
y = np.array([1, 2, 3, 4, 5, 6])

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X, y)
# (removed a stray bare `MultinomialNB(alpha=1.0, class_prior=None,
# fit_prior=True)` expression: it constructed an unused instance and had
# no effect on `clf`)
print(clf.predict(X[2:3]))  # should reproduce the training label of sample 2
# TF-IDF weighting on top of precomputed count matrices (Python 2 script).
# NOTE(review): `tfidf_train2` re-fits the SAME transformer, so after this
# point the transformer's idf state belongs to corpus 2, not corpus 1 --
# confirm this ordering is intended.
transformer = TfidfTransformer()
tfidf_train = transformer.fit_transform(countMatrix_train)
tfidf_test = transformer.transform(countMatrix_test)
tfidf_train2 = transformer.fit_transform(countMatrix_train2)
tfidf_test2 = transformer.transform(countMatrix_test2)
print tfidf_train.shape
print tfidf_test.shape

#X_train, X_test, y_train, y_test = inst[train], inst[test], classs[train], classs[test]

# Fit four classifiers on the same TF-IDF features.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(tfidf_train, y_train)

clf_mNB = MultinomialNB()
clf_mNB.fit(tfidf_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(tfidf_train, y_train)

# NOTE(review): despite the `_ada` name this is a RandomForest, not AdaBoost.
clf_ada = RandomForestClassifier(n_estimators=25)
clf_ada.fit(tfidf_train, y_train)

# Held-out accuracy for each model.
print clf_svm.score(tfidf_test, y_test)
print clf_mNB.score(tfidf_test, y_test)
print clf_knn.score(tfidf_test, y_test)
print clf_ada.score(tfidf_test, y_test)

predicted_svm = clf_svm.predict(tfidf_test)
#print np.mean(predicted_svm == y_train)
test_ss_x = ss.transform(test_x) # 创建KNN分类器 knn = KNeighborsClassifier() knn.fit(train_ss_x, train_y) predict_y = knn.predict(test_ss_x) print("KNN准确率: %.4lf" % accuracy_score(predict_y, test_y)) # 创建SVM分类器 svm = SVC() svm.fit(train_ss_x, train_y) predict_y = svm.predict(test_ss_x) print('SVM准确率: %0.4lf' % accuracy_score(predict_y, test_y)) # 采用Min-Max规范化 mm = preprocessing.MinMaxScaler() train_mm_x = mm.fit_transform(train_x) test_mm_x = mm.transform(test_x) # 创建Naive Bayes分类器 mnb = MultinomialNB() mnb.fit(train_mm_x, train_y) predict_y = mnb.predict(test_mm_x) print("多项式朴素贝叶斯准确率: %.4lf" % accuracy_score(predict_y, test_y)) # 创建CART决策树分类器 dtc = DecisionTreeClassifier() dtc.fit(train_mm_x, train_y) predict_y = dtc.predict(test_mm_x) print("CART决策树准确率: %.4lf" % accuracy_score(predict_y, test_y))
def func2():
    """Leave-one-out gender-prediction experiment (Python 2 script).

    Builds four per-user feature maps from trace/keyword files, then for
    each LOO fold trains three LogisticRegression->SVC pipelines (one per
    statistic feature set) plus a MultinomialNB on keyword features, and
    stacks their class probabilities into an 8-dim meta feature vector.
    """
    # Whitelist of MAC addresses to keep.
    user = {}
    for line in fileinput.input("../../data/select/select_a"):
        mac = line.strip().split(" ")[0]
        user[mac] = True
    fileinput.close()

    cnt_0, cnt_1 = 0, 0
    docMap_1, docMap_2, docMap_3, docMap_4, classMap = {}, {}, {}, {}, {}

    # Feature set 1: all-traffic statistics (also provides the gender label).
    for line in fileinput.input(
            "../../data/feature/trace_all_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):  # Python 2 dict API
            if sex == 0:
                cnt_0 += 1
            if sex == 1:
                cnt_1 += 1
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_1[mac] = _list
            classMap[mac] = sex
    fileinput.close()
    print cnt_0, cnt_1

    # Feature set 2: online-time statistics.
    for line in fileinput.input(
            "../../data/feature/trace_online_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_2[mac] = _list
    fileinput.close()

    # Feature set 3: HTTP statistics.
    for line in fileinput.input(
            "../../data/feature/trace_http_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_3[mac] = _list
    fileinput.close()

    # Feature set 4: normalized keyword features.
    for line in fileinput.input("../../data/feature/keywords_normalize_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_4[mac] = _list
    fileinput.close()

    # Keep only users present in all four feature maps plus the label map.
    docList_1, docList_2, docList_3, docList_4, classList = [], [], [], [], []
    # print len(user.keys()), len(docMap_1.keys()), len(docMap_2.keys()), len(docMap_3.keys()), len(docMap_4.keys())
    for k, v in user.iteritems():
        if k in docMap_1 and k in docMap_2 and k in docMap_3 and k in docMap_4 and k in classMap:
            docList_1.append(docMap_1[k])
            docList_2.append(docMap_2[k])
            docList_3.append(docMap_3[k])
            docList_4.append(docMap_4[k])
            classList.append(classMap[k])
    docList_1, docList_2, docList_3, docList_4, classList = np.array(
        docList_1), np.array(docList_2), np.array(docList_3), np.array(
            docList_4), np.array(classList)

    # Min-max scale the statistic sets (keyword features arrive normalized).
    min_max_scaler = preprocessing.MinMaxScaler()
    docList_1, docList_2, docList_3 = min_max_scaler.fit_transform(
        docList_1), min_max_scaler.fit_transform(
            docList_2), min_max_scaler.fit_transform(docList_3)

    cnt, errorCount = 0, 0
    loo = LeaveOneOut(len(classList))  # old-style sklearn LOO iterator
    trainingdoc, trainingclass = [], []
    # file = open("../../data/prediction/result","w")
    for train, test in loo:
        cnt += 1
        print cnt
        trainingdoc_1, trainingdoc_2, trainingdoc_3, trainingdoc_4, trainingclass, testingdoc_1, testingdoc_2, testingdoc_3, testingdoc_4, testingclass\
            = docList_1[train], docList_2[train], docList_3[train], docList_4[train], classList[train], docList_1[test], docList_2[test], docList_3[test], docList_4[test], classList[test]

        # One LR-feature-selection -> linear-SVC pipeline per statistic set.
        clf_1 = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l2',
                                             dual=False,
                                             tol=0.0001,
                                             C=1.0,
                                             fit_intercept=True,
                                             intercept_scaling=1,
                                             class_weight='auto',
                                             random_state=None)),
            ('classification',
             svm.SVC(kernel='linear', class_weight='auto', probability=True))
        ])
        clf_2 = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l2',
                                             dual=False,
                                             tol=0.0001,
                                             C=1.0,
                                             fit_intercept=True,
                                             intercept_scaling=1,
                                             class_weight='auto',
                                             random_state=None)),
            ('classification',
             svm.SVC(kernel='linear', class_weight='auto', probability=True))
        ])
        clf_3 = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l2',
                                             dual=False,
                                             tol=0.0001,
                                             C=1.0,
                                             fit_intercept=True,
                                             intercept_scaling=1,
                                             class_weight='auto',
                                             random_state=None)),
            ('classification',
             svm.SVC(kernel='linear', class_weight='auto', probability=True))
        ])
        gnb = MultinomialNB()

        clf_1.fit(trainingdoc_1, trainingclass)
        clf_2.fit(trainingdoc_2, trainingclass)
        clf_3.fit(trainingdoc_3, trainingclass)
        gnb.fit(trainingdoc_4, trainingclass)

        # Stack the four classifiers' [P(0), P(1)] outputs per training user.
        docList_final = []
        for one in train:
            res_1 = clf_1.predict_proba(docList_1[one])[0]
            res_2 = clf_2.predict_proba(docList_2[one])[0]
            res_3 = clf_3.predict_proba(docList_3[one])[0]
            res_4 = gnb.predict_proba(docList_4[one])[0]
            _list = [
                res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1],
                res_4[0], res_4[1]
            ]
            docList_final.append(_list)

        # Meta features for the single held-out user.
        res_1 = clf_1.predict_proba(testingdoc_1)[0]
        res_2 = clf_2.predict_proba(testingdoc_2)[0]
        res_3 = clf_3.predict_proba(testingdoc_3)[0]
        res_4 = gnb.predict_proba(testingdoc_4)[0]
        testing_final = [
            res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1],
            res_4[0], res_4[1]
        ]
        print testing_final
# Making the Confusion Matrix cm = confusion_matrix(y_validate, y_val_lgt_pred) class_label = ['1', '5'] df_cm = pd.DataFrame(cm, index=class_label,columns=class_label) sns.heatmap(df_cm, annot=True, fmt='d') plt.title('Confusion Matrix') plt.xlabel('Predicted Star') plt.ylabel('Actual Star') plt.show() # 3. Naive Bayes Classifier class_nbc_val = MultinomialNB() lgt_nbc_model = class_nbc_val.fit(tfidf_train, y_train) y_val_nbc_pred = lgt_nbc_model.predict(tfidf_validate) precision, recall, fscore, train_support = score(y_validate, y_val_nbc_pred, pos_label='5', average='binary') print('Precision: {} / Recall: {} / F1-Score: {} / Accuracy: {}'.format( round(precision, 3), round(recall, 3), round(fscore,3), round(acs(y_validate, y_val_nbc_pred), 3))) # Making the Confusion Matrix cm = confusion_matrix(y_validate, y_val_nbc_pred) class_label = ['1', '5'] df_cm = pd.DataFrame(cm, index=class_label,columns=class_label) sns.heatmap(df_cm, annot=True, fmt='d') plt.title('Confusion Matrix') plt.xlabel('Predicted Star') plt.ylabel('Actual Star')
X_df = df[['home', 'busca', 'logado']] Y_df = df['comprou'] Xdummies_df = pd.get_dummies(X_df) Ydummies_df = Y_df X = Xdummies_df.values Y = Ydummies_df.values porcentagem_de_treino = 0.9 tamanho_de_treino = porcentagem_de_treino * len(Y) tamanho_de_teste = len(Y) - tamanho_de_treino treino_dados = X[:int(tamanho_de_treino)] treino_marcacoes = Y[:int(tamanho_de_treino)] teste_dados = X[-int(tamanho_de_teste):] teste_marcacoes = Y[-int(tamanho_de_teste):] from sklearn.naive_bayes import MultinomialNB modelo = MultinomialNB() modelo.fit(treino_dados, treino_marcacoes) resultado = modelo.predict(teste_dados) diferencas = resultado - teste_marcacoes acertos = [d for d in diferencas if d == 0] total_de_acertos = len(acertos) total_de_elementos = len(teste_dados) taxa_de_acerto = 100.0 * total_de_acertos / total_de_elementos print taxa_de_acerto, total_de_elementos
class TextClassifier(object):
    """A text classifier model:
        - Vectorize the raw text into features.
        - Fit a naive bayes model to the resulting features.

    The work done by this class could also be done with a sklearn.pipeline
    object.  Since the author cannot guarantee that Pipelines have been
    introduced, he opted to write his own class implementing the model.

    This class is an example of coding to an interface, it implements the
    standard sklearn fit, predict, score interface.
    """

    def __init__(self):
        # TF-IDF features feeding a Multinomial Naive Bayes classifier.
        self._vectorizer = TfidfVectorizer()
        self._classifier = MultinomialNB()

    def fit(self, X, y):
        """Fit a text classifier model.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.
        y: A numpy array or python list of labels, to be used as responses.

        Returns
        -------
        self: The fit model object.
        """
        X = self._vectorizer.fit_transform(X)
        self._classifier.fit(X, y)
        return self

    def predict_proba(self, X):
        """Make probability predictions on new data.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.

        Returns
        -------
        probs: A (n_obs, n_classes) numpy array of predicted class
        probabilities.
        """
        X = self._vectorizer.transform(X)
        return self._classifier.predict_proba(X)

    def predict(self, X):
        """Make class predictions on new data.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.

        Returns
        -------
        preds: A (n_obs,) numpy array containing the predicted class for each
        observation (i.e. the class with the maximal predicted class
        probability).
        """
        X = self._vectorizer.transform(X)
        return self._classifier.predict(X)

    def score(self, X, y):
        """Return a classification accuracy score on new data.

        Parameters
        ----------
        X: A numpy array or list of text fragments.
        y: A numpy array or python list of true class labels.
        """
        X = self._vectorizer.transform(X)
        return self._classifier.score(X, y)
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
import time

# Train a TF-IDF + MultinomialNB model on five newsgroups and persist it.
categories = ['alt.atheism', 'sci.space', 'comp.graphics',
              'rec.motorcycles', 'sci.electronics']
news = fetch_20newsgroups(remove=("headers", "footers", "quotes"),
                          categories=categories)

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news.data)

clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, news.target)

# BUG FIX: `pickle.dump(..., open("nb_model", "wb"))` never closed the file
# handle; use a context manager so the pickle is flushed and the descriptor
# released deterministically.
with open("nb_model", "wb") as f:
    pickle.dump({"vectorizer": vectorizer, "model": clf}, f)

# pred = clf.predict(vectorizer.transform([news.data[-1]]))
# print news.target_names[pred[0]]
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train) # transforming into Bag-of-Words and hence textual data to numeric.. text_bow_train = bow_transformer.transform(X_train) # transforming into Bag-of-Words and hence textual data to numeric.. text_bow_test = bow_transformer.transform(X_test) # # # # Naive Bayes # # # # In[111]: from sklearn.naive_bayes import MultinomialNB # instantiating the model with Multinomial Naive Bayes.. model = MultinomialNB() # training the model... model = model.fit(text_bow_train, y_train) # In[73]: model.score(text_bow_train, y_train) # In[74]: # Importing necessary libraries from sklearn.metrics import classification_report # getting the predictions of the Validation Set... predictions = model.predict(text_bow_test) # getting the Precision, Recall, F1-Score print(classification_report(y_test, predictions))