# NOTE(review): this call looks like the tail of the training loop
# ("for document in m:") whose header lies before this chunk -
# confirm its indentation against the preceding lines.
classifier.train(document)

# We can now ask it questions about unknown e-mails.
# classify() answers with the predicted document type:
# True for a real message, False for spam.
print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))    # True: most likely a real message.
print()
print(classifier.classify("customer"))   # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))       # True: because most likely everyone knows everyone.
print()

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
# Note that k_fold_cv() takes the classifier class (NB), not the trained instance.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))

# This yields 5 scores: (Accuracy, Precision, Recall, F-score, standard deviation).
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
# Recall = how accurately false negatives are discarded.
# F-score = harmonic mean of precision and recall.
# stdev = folds' variation from average F-score.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails.
# print(...) with a single argument behaves the same on Python 2 and 3;
# print("") emits the blank separator line on both.
print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))    # True: most likely a real message.
print("")
print(classifier.classify("customer"))   # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))       # True: because most likely everyone knows everyone.
print("")

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
# The classifier class is passed (not the trained instance); NB is used here
# because it is the class instantiated above and therefore known to be in scope.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))

# This yields 4 scores: (Accuracy, Precision, Recall, F-score).
# NOTE(review): the number of scores depends on the installed Pattern release;
# newer releases appear to append a fifth value (standard deviation) - confirm.
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
# Recall = how accurately false negatives are discarded.
# F-score = harmonic mean of precision and recall.
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails.
# print(...) with a single argument behaves the same on Python 2 and 3;
# print("") emits the blank separator line on both.
print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))    # True: most likely a real message.
print("")
print(classifier.classify("customer"))   # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))       # True: because most likely everyone knows everyone.
print("")

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
# The classifier class is passed (not the trained instance); NB is used here
# because it is the class instantiated above and therefore known to be in scope.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))

# This yields 4 scores: (Accuracy, Precision, Recall, F-score).
# NOTE(review): the number of scores depends on the installed Pattern release;
# newer releases appear to append a fifth value (standard deviation) - confirm.
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
# Recall = how accurately false negatives are discarded.
# F-score = harmonic mean of precision and recall.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails.
# classify() answers with the predicted document type.
print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))    # True: most likely a real message.
print()
print(classifier.classify("customer"))   # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))       # True: because most likely everyone knows everyone.
print()

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
# Note that k_fold_cv() takes the classifier class (NB), not the trained instance.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))

# This yields 5 scores: (Accuracy, Precision, Recall, F-score, standard deviation).
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
# Recall = how accurately false negatives are discarded.
# F-score = harmonic mean of precision and recall.
# stdev = folds' variation from average F-score.
# Each document has a type: True for real e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = Bayes()
for document in corpus:
    classifier.train(document)

# We can now ask it questions about unknown e-mails.
# print(...) with a single argument behaves the same on Python 2 and 3;
# print("") emits the blank separator line on both.
print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))    # True: most likely a real message.
print("")
print(classifier.classify("customer"))   # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))       # True: because most likely everyone knows everyone.
print("")

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
# Note that k_fold_cv() takes the classifier class (Bayes), not the trained instance.
from pattern.vector import k_fold_cv
print(k_fold_cv(Bayes, folds=10, documents=corpus))

# This yields 4 scores: Accuracy, Precision, Recall and F-score.
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
# Recall = how accurately false negatives are discarded.