Python NaiveBayes.classify示例，bayes.NaiveBayes.classify Python示例

示例#1

0

显示文件

class SpamHamDetector(object):
    """Train a NaiveBayes model on labelled emails and classify unseen ones.

    Files are looked up relative to ``path``:

    * ``labels.csv`` -- read with ``csv.DictReader``; the ``Id`` and
      ``Prediction`` columns are used.
    * ``TR/TRAIN_<Id>.eml`` -- training emails.
    * ``TT/TEST_<n>.eml`` -- emails to classify.
    * ``results.csv`` -- written by ``classify``.
    """

    def __init__(self, categories, path):
        """Create the detector.

        :param categories: passed straight through to ``NaiveBayes``.
        :param path: base directory holding the label file and emails.
        """
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps test example number (as str) -> predicted category (as str).
        self.classified_examples = dict()

    def _labeled_rows(self):
        """Yield ``(email_id, label, filename)`` for each row of labels.csv."""
        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            for row in csv.DictReader(labels_csv):
                # Use self.path: there is no local or global ``path`` here.
                filename = '%s/TR/TRAIN_%s.eml' % (self.path, row['Id'])
                yield row['Id'], row['Prediction'], filename

    def _train_one(self, email_id, label, filename):
        """Train on a single email; failures are logged, not raised."""
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Log the exception itself: ``e.message`` does not exist on
            # Python 3 and would raise from inside this handler.
            logger.info("Error training email %s: %s", email_id, e)

    def train(self):
        """Train the classifier on every email listed in labels.csv."""
        for email_id, label, filename in self._labeled_rows():
            self._train_one(email_id, label, filename)

    def train_and_evaluate(self):
        """Train on a random 2250 of ids 1..2500 and test on the other 250.

        :return: summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, 2501))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests in the per-row checks below.
        training_ids = set(all_ids[:2250])
        labeling_ids = set(all_ids[2250:])

        # First pass: train on the training split only.
        for email_id, label, filename in self._labeled_rows():
            if int(email_id) in training_ids:
                self._train_one(email_id, label, filename)

        # Second pass: classify the held-out split and score it.
        correct, incorrect = 0, 0
        for email_id, label, filename in self._labeled_rows():
            if int(email_id) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                if result == int(label):
                    correct += 1
                else:
                    incorrect += 1
            except Exception as e:
                logger.info("Error classifying email %s: %s", email_id, e)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify TEST_1..TEST_<size> and write the results to results.csv.

        :param size: number of test emails to attempt.
        """
        test = self.path + '/TT/TEST_%s.eml'

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(
                    self.naive_bayes.classify(test_body))
            except Exception as e:
                # Best effort: a bad email is skipped, not fatal.
                logger.info("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a summary of classified emails (category '0' is counted as
        spam, '1' as ham). Fractions are 0.0 when nothing was classified."""
        categories = list(self.classified_examples.values())
        total = len(categories)
        spam = categories.count('0')
        ham = categories.count('1')
        # Guard against ZeroDivisionError when there are no results yet.
        spam_fraction = float(spam) / total if total else 0.0
        ham_fraction = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" \
               % (spam, ham, spam_fraction, ham_fraction)

    def _calculate_results(self, correct, incorrect):
        """Format the counts and the fraction correct (0.0 if no samples)."""
        total = correct + incorrect
        performance = float(correct) / total if total else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct, incorrect, performance)

    def _store_results(self):
        """Write ``classified_examples`` to results.csv under ``path``."""
        with open('%s/results.csv' % self.path, 'w+') as resultscsv:
            writer = csv.DictWriter(resultscsv,
                                    fieldnames=['id', 'Prediction'])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({'id': example_num, 'Prediction': category})

示例#2

0

显示文件

文件： test.py 项目： Renl1001/MachineLearning

# -*- coding:utf-8 -*-
from bayes import NaiveBayes


def loadDataSet():
    """Return the toy corpus as (train_samples, train_classes, test_samples).

    Each sample is a list of word tokens. ``train_classes[i]`` is the label
    of ``train_samples[i]``: 0 means good, 1 means bad.
    """
    # Each training document is kept next to its label (0: good, 1: bad).
    labelled = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him'],
         0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
         0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    train_samples = [words for words, _ in labelled]
    train_classes = [label for _, label in labelled]

    # Unlabelled documents for trying out the trained classifier.
    test_samples = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', "Love", "You"],
        ['This', 'is', "my", "dog"],
    ]
    return train_samples, train_classes, test_samples


if __name__ == "__main__":
    # Demo: fit the classifier on the toy corpus, then run every test
    # document through it. The return value of classify() is not used here.
    train_samples, train_classes, test_samples = loadDataSet()
    clf = NaiveBayes()
    clf.train(train_samples, train_classes)

    for item in test_samples:
        clf.classify(item)

示例#3

0

显示文件

    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
)

import time

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load cmap.b when it comes from a trusted source.
# The ``with`` block closes the file handle, which the bare open() leaked.
with open('../hadoop/thread_views/var/cmap.b', 'rb') as cmap_file:
    cmap = pickle.load(cmap_file)
clsfr = FMClassifier(cmap)
backend = RedisBackend()
bayes = NaiveBayes(backend=backend)
bayes.train(training_data)
# now we are ready to test the bayes filter

# TODO add support for subfeatures in features

# Time a single classify() call.
_start = time.time()
# Alternative sample invocations:
# bayes.classify(clsfr, ('aldfksjalskdjfasdflapoliticsadlskfajsldfj',), 'cnn')
# bayes.classify(clsfr, ('politics', 'aldfksjalspoliticskdjfasdflbusinessapmusicadlhomeskfajsldfj', 'music'), 'cnn')
# bayes.classify(clsfr, ('business', 'music', 'love', 'living', 'politics', 'music'), 'cnn', linear_weight_vector=True)
# bayes.classify(clsfr, ('tech', 'computers', 'news'), 'cnn')
line = 'gaming.www.myvidster.com/video/2797926/PornoTubecom_-_Keymon_Phoenix_Mister_Buck_Dee_Truth_Intrigue_and_Jermany_-_Browsin'
bayes.classify(clsfr, line.split('/'), 'myvidster.com', linear_weight_vector=True)
# A single formatted string prints "<elapsed> seconds" on both Python 2 and
# Python 3; the old print-statement form dropped the suffix on Python 3.
print('%s seconds' % (time.time() - _start))

示例#4

0

显示文件

文件： spam_detector.py 项目： lorenanicole/python-naive-bayes-spam-classifier

class SpamHamDetector(object):
    """Spam/ham email detector built on top of a NaiveBayes classifier.

    All files are resolved relative to ``path``:

    * ``labels.csv`` -- read with ``csv.DictReader``; the ``Id`` and
      ``Prediction`` columns are used.
    * ``TR/TRAIN_<Id>.eml`` -- training emails.
    * ``TT/TEST_<n>.eml`` -- emails to classify.
    * ``results.csv`` -- output written by ``classify``.
    """

    def __init__(self, categories, path):
        """Create the detector.

        :param categories: forwarded unchanged to ``NaiveBayes``.
        :param path: base directory containing the label file and emails.
        """
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps test example number (as str) -> predicted category (as str).
        self.classified_examples = dict()

    def _iter_labels(self):
        """Yield ``(email_id, label, filename)`` per row of labels.csv."""
        with open("{0}/labels.csv".format(self.path), "r") as labels_csv:
            for row in csv.DictReader(labels_csv):
                # Must be self.path: no bare ``path`` name exists in scope.
                filename = "%s/TR/TRAIN_%s.eml" % (self.path, row["Id"])
                yield row["Id"], row["Prediction"], filename

    def _train_on_email(self, email_id, label, filename):
        """Feed one labelled email to the classifier, logging any failure."""
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Pass the exception itself; ``e.message`` is gone in Python 3
            # and accessing it would raise from inside this handler.
            logger.info("Error training email %s: %s", email_id, e)

    def train(self):
        """Train the classifier on every email listed in labels.csv."""
        for email_id, label, filename in self._iter_labels():
            self._train_on_email(email_id, label, filename)

    def train_and_evaluate(self):
        """Train on a random 2250 of ids 1..2500 and score the other 250.

        :return: summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, 2501))
        random.shuffle(all_ids)
        # Sets make the per-row membership checks O(1) instead of O(n).
        training_ids = set(all_ids[:2250])
        labeling_ids = set(all_ids[2250:])

        # Pass 1: train only on the training split.
        for email_id, label, filename in self._iter_labels():
            if int(email_id) in training_ids:
                self._train_on_email(email_id, label, filename)

        # Pass 2: classify the held-out split and tally the outcome.
        correct, incorrect = 0, 0
        for email_id, label, filename in self._iter_labels():
            if int(email_id) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                if result == int(label):
                    correct += 1
                else:
                    incorrect += 1
            except Exception as e:
                logger.info("Error classifying email %s: %s", email_id, e)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify TEST_1..TEST_<size> and store the results in results.csv.

        :param size: number of test emails to attempt.
        """
        test = self.path + "/TT/TEST_%s.eml"

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(self.naive_bayes.classify(test_body))
            except Exception as e:
                # Best effort: skip emails that cannot be read or classified.
                logger.info("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a summary of classified emails (category "0" is counted as
        spam, "1" as ham). Fractions are 0.0 when nothing was classified."""
        categories = list(self.classified_examples.values())
        total = len(categories)
        spam = categories.count("0")
        ham = categories.count("1")
        # Avoid ZeroDivisionError when no emails have been classified yet.
        spam_fraction = float(spam) / total if total else 0.0
        ham_fraction = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" % (
            spam,
            ham,
            spam_fraction,
            ham_fraction,
        )

    def _calculate_results(self, correct, incorrect):
        """Format the counts and the fraction correct (0.0 if no samples)."""
        total = correct + incorrect
        performance = float(correct) / total if total else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct,
            incorrect,
            performance,
        )

    def _store_results(self):
        """Write ``classified_examples`` to results.csv under ``path``."""
        with open("%s/results.csv" % self.path, "w+") as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=["id", "Prediction"])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({"id": example_num, "Prediction": category})