def testNaiveBayesSK2(self):
    """Check NaiveBayes on TF-IDF features of four 20 Newsgroups categories.

    Fits the classifier on the 'train' subset, predicts the 'test' subset,
    and requires a weighted F1 score above 0.8.
    """
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    vectorizer = TfidfVectorizer()
    # Both vectors and vectors_test are SciPy CSR matrix.
    # The vectorizer is fitted on the training text only; the test text is
    # transformed with that same fitted vectorizer.
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    nb = NaiveBayes(sqlCtx)
    nb.fit(vectors, newsgroups_train.target)
    pred = nb.predict(vectors_test)
    score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
    # assertGreater replaces the failUnless alias (removed from unittest in
    # Python 3.12) and prints the actual score when the check fails.
    self.assertGreater(score, 0.8)
def testNaiveBayesSK2(self):
    """Train NaiveBayes on TF-IDF text features and check weighted F1.

    Uses four 20 Newsgroups categories: the 'train' subset is used for
    fitting and the 'test' subset for prediction. The test passes when the
    weighted F1 score of the predictions is greater than 0.8.
    """
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

    vectorizer = TfidfVectorizer()
    # Both vectors and vectors_test are SciPy CSR matrix
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)

    nb = NaiveBayes(sqlCtx)
    nb.fit(vectors, newsgroups_train.target)
    pred = nb.predict(vectors_test)

    score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
    # failUnless was a deprecated alias and no longer exists in Python 3.12+;
    # assertGreater also includes the score in the failure message.
    self.assertGreater(score, 0.8)
def testNaiveBayesSK1(self):
    """Check NaiveBayes on the digits dataset with a 90/10 ordered split.

    The first 90% of the samples are used for fitting and the remaining
    samples for scoring. The test passes when the value returned by the
    classifier's ``score`` method is greater than 0.85.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)
    # Slice bounds must be integers: a float bound such as ``.9 * n_samples``
    # raises on current NumPy, so truncate once and reuse the index.
    split = int(.9 * n_samples)
    X_train = X_digits[:split]
    y_train = y_digits[:split]
    X_test = X_digits[split:]
    y_test = y_digits[split:]
    nb = NaiveBayes(sqlCtx)
    score = nb.fit(X_train, y_train).score(X_test, y_test)
    # assertGreater replaces the failUnless alias (removed from unittest in
    # Python 3.12) and prints the actual score when the check fails.
    self.assertGreater(score, 0.85)
def test_naive_bayes(self):
    """Check NaiveBayes on the digits dataset with a 90/10 ordered split.

    The first 90% of the samples are used for fitting and the remaining
    samples for scoring. The test passes when the value returned by the
    classifier's ``score`` method is greater than 0.8.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)
    # Compute the integer split point once instead of repeating it per slice.
    split = int(.9 * n_samples)
    X_train, X_test = X_digits[:split], X_digits[split:]
    y_train, y_test = y_digits[:split], y_digits[split:]
    nb = NaiveBayes(sqlCtx)
    score = nb.fit(X_train, y_train).score(X_test, y_test)
    # assertGreater replaces the failUnless alias (removed from unittest in
    # Python 3.12) and prints the actual score when the check fails.
    self.assertGreater(score, 0.8)
def test_naive_bayes1(self):
    """Compare NaiveBayes predictions with scikit-learn's MultinomialNB on text.

    Both classifiers are fitted on TF-IDF features of the 'train' subset of
    four 20 Newsgroups categories and predict the 'test' subset. The test
    passes when the two prediction vectors agree on more than 95% of the
    test samples.
    """
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    vectorizer = TfidfVectorizer()
    # Both vectors and vectors_test are SciPy CSR matrix
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    nb = NaiveBayes(sparkSession)
    mllearn_predicted = nb.fit(vectors, newsgroups_train.target).predict(vectors_test)
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB()
    sklearn_predicted = clf.fit(vectors, newsgroups_train.target).predict(vectors_test)
    # The scikit-learn predictions act as the reference labels here.
    agreement = accuracy_score(sklearn_predicted, mllearn_predicted)
    # assertGreater replaces the failUnless alias (removed from unittest in
    # Python 3.12) and prints the agreement rate when the check fails.
    self.assertGreater(agreement, 0.95)
def test_naive_bayes(self):
    """Compare NaiveBayes predictions with scikit-learn's MultinomialNB on digits.

    Both classifiers are fitted on the first 90% of the digits samples and
    predict the remaining samples. The test passes when the two prediction
    vectors agree on more than 95% of the held-out samples.
    """
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)
    # Compute the integer split point once instead of repeating it per slice.
    split = int(.9 * n_samples)
    X_train, X_test = X_digits[:split], X_digits[split:]
    y_train, y_test = y_digits[:split], y_digits[split:]
    nb = NaiveBayes(sparkSession)
    mllearn_predicted = nb.fit(X_train, y_train).predict(X_test)
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB()
    sklearn_predicted = clf.fit(X_train, y_train).predict(X_test)
    # The scikit-learn predictions act as the reference labels here.
    agreement = accuracy_score(sklearn_predicted, mllearn_predicted)
    # assertGreater replaces the failUnless alias (removed from unittest in
    # Python 3.12) and prints the agreement rate when the check fails.
    self.assertGreater(agreement, 0.95)