def train(spamDir, hamDir):
    """Train a Naive Bayes classifier and persist it to CLASSIFIER_PATH.

    Every text yielded by getDirContent(spamDir) becomes a training example
    labelled SPAM; every text yielded by getDirContent(hamDir) becomes one
    labelled NOSPAM. Each text is converted with featuresForText() first.

    Args:
        spamDir: location handed to getDirContent() for the spam examples.
        hamDir: location handed to getDirContent() for the non-spam examples.
    """
    # Spam examples go first, then ham, matching the order texts are read.
    labeled = [(featuresForText(doc), SPAM) for doc in getDirContent(spamDir)]
    labeled.extend(
        (featuresForText(doc), NOSPAM) for doc in getDirContent(hamDir)
    )

    # Randomise the example order in place before training.
    shuffle(labeled)

    model = NaiveBayesClassifier.train(labeled)
    saveClassifier(model, CLASSIFIER_PATH)
    print("Done with learning.")
def getBayesAccuracy(splitRatio=0.9):
    """Estimate classifier accuracy on a held-out part of the corpus.

    Builds a labelled feature set from getSpamContent() (label SPAM) and
    getHamContent() (label NOSPAM), shuffles it, splits it with
    splitByRatio(), trains a NaiveBayesClassifier on the first part and
    measures accuracy on the second part.

    Args:
        splitRatio: value forwarded to splitByRatio(); presumably the
            fraction of examples used for training (default 0.9) -- confirm
            against splitByRatio().

    Returns:
        The value returned by nltk_classify.accuracy() for the trained
        classifier on the held-out set.
    """
    featureset = [(featuresForText(text), SPAM) for text in getSpamContent()]
    featureset.extend(
        (featuresForText(text), NOSPAM) for text in getHamContent()
    )

    # Shuffle so that the split below does not separate the two classes
    # (spam examples were all added before ham examples).
    shuffle(featureset)
    trainset, devset = splitByRatio(featureset, splitRatio)

    classifier = NaiveBayesClassifier.train(trainset)

    # show_most_informative_features() writes its table to stdout itself and
    # returns None; wrapping it in a print statement emitted a stray "None"
    # line after the table, so it is called directly here.
    classifier.show_most_informative_features(10)

    return nltk_classify.accuracy(classifier, devset)
def classify(evalDir, resultFilename):
    """Classify every text under evalDir and write the verdicts to a file.

    Loads the classifier stored at CLASSIFIER_PATH, runs it over each
    (text, filepath) pair produced by iterDirContent(), and writes one
    "<filepath>TAB<verdict>" line per text to resultFilename.

    Args:
        evalDir: location handed to iterDirContent() for the texts to label.
        resultFilename: path of the output file; opened in "w" mode, so any
            existing content is overwritten.

    Raises:
        Exception: if loadClassifier() returns a falsy value.
    """
    model = loadClassifier(CLASSIFIER_PATH)
    if not model:
        raise Exception("Classifier was not loaded.")
    print("loaded")

    with open(resultFilename, "w") as out:
        for body, path in iterDirContent(evalDir, yieldFilepath=True):
            # The predicted label is tested for truthiness.
            # NOTE(review): this assumes the spam label is truthy and the
            # non-spam label is falsy -- confirm against SPAM / NOSPAM.
            if model.classify(featuresForText(body)):
                verdict = S_SPAM
            else:
                verdict = S_NOSPAM
            out.write("%s\t%s\n" % (path, verdict))

    print("Classified output was saved to file '%s'." % resultFilename)
    print("Done with classifying.")