示例#1
0
class SpamHamDetector(object):
    """Spam/ham e-mail classifier built on top of ``NaiveBayes``.

    All files are resolved relative to ``path``:

    * ``labels.csv`` with the columns ``Id`` and ``Prediction``,
    * training mails at ``TR/TRAIN_<Id>.eml``,
    * test mails at ``TT/TEST_<n>.eml``,
    * predictions are written to ``results.csv``.
    """

    def __init__(self, categories, path):
        """Create the detector.

        Args:
            categories: passed through unchanged to ``NaiveBayes``.
            path: directory that holds the data set described above.
        """
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps test e-mail number (as str) -> predicted category (as str).
        self.classified_examples = dict()

    def _read_label_rows(self):
        """Return every row of ``labels.csv`` as a list of dicts."""
        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            return list(csv.DictReader(labels_csv))

    def _training_filename(self, email_id):
        """Return the path of the training e-mail with the given id."""
        return '%s/TR/TRAIN_%s.eml' % (self.path, email_id)

    def _train_on_row(self, row):
        """Train on one ``labels.csv`` row; log and skip it on failure."""
        label = row['Prediction']
        filename = self._training_filename(row['Id'])
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Best effort: one unreadable mail must not abort the whole run.
            # The exception is formatted directly because ``e.message`` does
            # not exist on Python 3.
            logger.info("Error training email %s: %s", row['Id'], e)

    def train(self):
        """Train the model on every e-mail listed in ``labels.csv``."""
        for row in self._read_label_rows():
            self._train_on_row(row)

    def train_and_evaluate(self, sample_count=2500, training_count=2250):
        """Train on a random subset of the labelled mails, test on the rest.

        Ids ``1..sample_count`` are shuffled; the first ``training_count``
        are used for training and the remainder for evaluation.

        Args:
            sample_count: highest e-mail id taking part in the split.
            training_count: number of shuffled ids used for training.

        Returns:
            The summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, sample_count + 1))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests inside the per-row loops below.
        training_ids = set(all_ids[:training_count])
        labeling_ids = set(all_ids[training_count:])

        rows = self._read_label_rows()

        # Train on the complete training split before classifying anything.
        for row in rows:
            if int(row['Id']) in training_ids:
                self._train_on_row(row)

        correct, incorrect = 0, 0
        for row in rows:
            label = row['Prediction']
            filename = self._training_filename(row['Id'])
            if int(row['Id']) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                matched = result == int(label)
            except Exception as e:
                logger.info("Error classifying email %s: %s", row['Id'], e)
                continue
            if matched:
                correct += 1
            else:
                incorrect += 1
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test mails ``1..size`` and write them to ``results.csv``.

        Mails that cannot be read or classified are logged and left out of
        ``classified_examples``.
        """
        test = self.path + '/TT/TEST_%s.eml'

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(
                    self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a summary of the predictions made by ``classify``.

        Category ``'0'`` is counted as spam and ``'1'`` as ham. The two
        shares are reported as 0.0 when nothing has been classified yet.
        """
        predictions = list(self.classified_examples.values())
        total = len(predictions)
        spam = predictions.count('0')
        ham = predictions.count('1')
        # Guard the empty case instead of raising ZeroDivisionError.
        spam_share = float(spam) / total if total else 0.0
        ham_share = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" \
               % (spam, ham, spam_share, ham_share)

    def _calculate_results(self, correct, incorrect):
        """Return a summary string with the accuracy of an evaluation run.

        The accuracy is reported as 0.0 when no mail was evaluated.
        """
        evaluated = correct + incorrect
        accuracy = float(correct) / evaluated if evaluated else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct, incorrect, accuracy)

    def _store_results(self):
        """Write ``classified_examples`` to ``results.csv`` under ``path``."""
        with open('%s/results.csv' % self.path, 'w+') as resultscsv:
            writer = csv.DictWriter(resultscsv,
                                    fieldnames=['id', 'Prediction'])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({'id': example_num, 'Prediction': category})
示例#2
0
# -*- coding:utf-8 -*-
from bayes import NaiveBayes


def loadDataSet():
    """Return the toy corpus used to exercise the classifier.

    Returns:
        A ``(train_samples, train_classes, test_samples)`` tuple: six
        tokenised training posts, their labels (0: good, 1: bad) in the same
        order, and four unlabelled tokenised test posts.
    """
    # Each training post is kept next to its label so the pairing is visible.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love',
             'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop',
             'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    train_samples = [words for _, words in labelled_posts]
    train_classes = [label for label, _ in labelled_posts]

    test_samples = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', "Love", "You"],
        ['This', 'is', "my", "dog"],
    ]
    return train_samples, train_classes, test_samples


if __name__ == "__main__":
    # Demo run: fit the classifier on the toy corpus, then feed it every
    # held-out post. The value returned by classify() is not used here.
    samples, labels, held_out = loadDataSet()

    model = NaiveBayes()
    model.train(samples, labels)
    for words in held_out:
        model.classify(words)
示例#3
0
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
)

# Load the pickled category map and close the file handle deterministically.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a cmap.b that this project produced itself.
with open('../hadoop/thread_views/var/cmap.b', 'rb') as cmap_file:
    cmap = pickle.load(cmap_file)
clsfr = FMClassifier(cmap)
backend = RedisBackend()
bayes = NaiveBayes(backend=backend)
bayes.train(training_data)
# now we are ready to test the bayes filter

# TODO add support for subfeatures in features

import time
_start = time.time()
# Alternative sample inputs, kept for manual experiments:
# bayes.classify(clsfr, ('aldfksjalskdjfasdflapoliticsadlskfajsldfj',), 'cnn')
# bayes.classify(clsfr, ('politics', 'aldfksjalspoliticskdjfasdflbusinessapmusicadlhomeskfajsldfj', 'music'), 'cnn')
# bayes.classify(clsfr, ('business', 'music', 'love', 'living', 'politics', 'music'), 'cnn', linear_weight_vector=True)
# bayes.classify(clsfr, ('tech', 'computers', 'news'), 'cnn')
line = 'gaming.www.myvidster.com/video/2797926/PornoTubecom_-_Keymon_Phoenix_Mister_Buck_Dee_Truth_Intrigue_and_Jermany_-_Browsin'
bayes.classify(clsfr, line.split('/'), 'myvidster.com', linear_weight_vector=True)
# Format the message first so the output is "<elapsed> seconds" on both
# Python 2 and Python 3 (the old print statement dropped the unit on 3).
print('%s seconds' % (time.time() - _start))

class SpamHamDetector(object):
    """Spam/ham e-mail classifier built on top of ``NaiveBayes``.

    All files are resolved relative to ``path``:

    * ``labels.csv`` with the columns ``Id`` and ``Prediction``,
    * training mails at ``TR/TRAIN_<Id>.eml``,
    * test mails at ``TT/TEST_<n>.eml``,
    * predictions are written to ``results.csv``.
    """

    def __init__(self, categories, path):
        """Create the detector.

        Args:
            categories: passed through unchanged to ``NaiveBayes``.
            path: directory that holds the data set described above.
        """
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps test e-mail number (as str) -> predicted category (as str).
        self.classified_examples = dict()

    def _read_label_rows(self):
        """Return every row of ``labels.csv`` as a list of dicts."""
        with open("%s/labels.csv" % self.path, "r") as labels_csv:
            return list(csv.DictReader(labels_csv))

    def _training_filename(self, email_id):
        """Return the path of the training e-mail with the given id."""
        return "%s/TR/TRAIN_%s.eml" % (self.path, email_id)

    def _train_on_row(self, row):
        """Train on one ``labels.csv`` row; log and skip it on failure."""
        label = row["Prediction"]
        filename = self._training_filename(row["Id"])
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Best effort: one unreadable mail must not abort the whole run.
            # The exception is formatted directly because ``e.message`` does
            # not exist on Python 3.
            logger.info("Error training email %s: %s", row["Id"], e)

    def train(self):
        """Train the model on every e-mail listed in ``labels.csv``."""
        for row in self._read_label_rows():
            self._train_on_row(row)

    def train_and_evaluate(self, sample_count=2500, training_count=2250):
        """Train on a random subset of the labelled mails, test on the rest.

        Ids ``1..sample_count`` are shuffled; the first ``training_count``
        are used for training and the remainder for evaluation.

        Args:
            sample_count: highest e-mail id taking part in the split.
            training_count: number of shuffled ids used for training.

        Returns:
            The summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, sample_count + 1))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests inside the per-row loops below.
        training_ids = set(all_ids[:training_count])
        labeling_ids = set(all_ids[training_count:])

        rows = self._read_label_rows()

        # Train on the complete training split before classifying anything.
        for row in rows:
            if int(row["Id"]) in training_ids:
                self._train_on_row(row)

        correct, incorrect = 0, 0
        for row in rows:
            label = row["Prediction"]
            filename = self._training_filename(row["Id"])
            if int(row["Id"]) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                matched = result == int(label)
            except Exception as e:
                logger.info("Error classifying email %s: %s", row["Id"], e)
                continue
            if matched:
                correct += 1
            else:
                incorrect += 1
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test mails ``1..size`` and write them to ``results.csv``.

        Mails that cannot be read or classified are logged and left out of
        ``classified_examples``.
        """
        test = self.path + "/TT/TEST_%s.eml"

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a summary of the predictions made by ``classify``.

        Category ``"0"`` is counted as spam and ``"1"`` as ham. The two
        shares are reported as 0.0 when nothing has been classified yet.
        """
        predictions = list(self.classified_examples.values())
        total = len(predictions)
        spam = predictions.count("0")
        ham = predictions.count("1")
        # Guard the empty case instead of raising ZeroDivisionError.
        spam_share = float(spam) / total if total else 0.0
        ham_share = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" % (
            spam,
            ham,
            spam_share,
            ham_share,
        )

    def _calculate_results(self, correct, incorrect):
        """Return a summary string with the accuracy of an evaluation run.

        The accuracy is reported as 0.0 when no mail was evaluated.
        """
        evaluated = correct + incorrect
        accuracy = float(correct) / evaluated if evaluated else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct,
            incorrect,
            accuracy,
        )

    def _store_results(self):
        """Write ``classified_examples`` to ``results.csv`` under ``path``."""
        with open("%s/results.csv" % self.path, "w+") as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=["id", "Prediction"])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({"id": example_num, "Prediction": category})