import glob
import random
import re
from collections import Counter


def main():
    data = []

    # read the first 500 files from each class directory and keep
    # (subject line, label) pairs
    for verdict in ['spam', 'not_spam']:
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = (verdict == 'spam')
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    print("Spam" if classifier.classify("Get free laptops now!") > 0.5 else "Not Spam")

    # triplets of (subject, actual is_spam, predicted spam probability)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # count the combinations of (actual is_spam, predicted is_spam)
    count = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)

    spammiest_hams, hammiest_spams = most_misclassified(classified)

    print("Accuracy: ", accuracy(count))
    print("Precision: ", precision(count))
    print("Recall: ", recall(count))
    print("\nTop 5 falsely classified as spam:\n\n", spammiest_hams)
    print("\nTop 5 falsely classified as not spam:\n\n", hammiest_spams)
    print("\nSpammiest words: ", spammiest_word(classifier))
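# main() calls accuracy(), precision(), and recall() on a Counter keyed by
# (actual is_spam, predicted is_spam) pairs but does not define them. A
# minimal sketch of those helpers, assuming that keying convention:

def accuracy(counts):
    # fraction of all predictions that were correct
    correct = counts[(True, True)] + counts[(False, False)]
    return correct / sum(counts.values())


def precision(counts):
    # fraction of predicted spams that actually are spam
    tp, fp = counts[(True, True)], counts[(False, True)]
    return tp / (tp + fp)


def recall(counts):
    # fraction of actual spams that were predicted as spam
    tp, fn = counts[(True, True)], counts[(True, False)]
    return tp / (tp + fn)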
import glob
import json
import random
import re


def spamFilterChecker():
    print('Received the JAVA Request!')

    # Get the text data from the JAVA program.
    req_data = abhishek_request.get_json()
    text_to_be_classified = req_data['text_to_be_classified']
    print(text_to_be_classified)

    # ----------------------------------------------------------------------
    # Train a Naive Bayes classifier on the local corpus instead of
    # POSTing to the plino Spam API.
    # ----------------------------------------------------------------------
    data = []
    for verdict in ['spam', 'not_spam']:
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = (verdict == 'spam')
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)
    train_data, test_data = split_data(data, 0.80)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # classify() is treated here as the probability that the text is ham
    value = classifier.classify(text_to_be_classified)
    json_response = json.dumps(
        {'email_class': 'ham' if value >= 0.9 else 'spam'})

    print("====================================================")
    print("POSSIBILITY OF HAM : ", value)
    print(json_response)
    print("====================================================")
    return json_response
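# spamFilterChecker() reads a JSON body via abhishek_request.get_json(), which
# looks like a Flask-style request object. A minimal sketch of how such an
# endpoint might be registered, assuming Flask; the route path and port are
# illustrative, and `classifier` is assumed to be trained once at startup
# exactly as in the function above, so the corpus isn't re-read per request.

from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route("/spam-filter", methods=["POST"])  # hypothetical route path
def spam_filter_endpoint():
    text = request.get_json()['text_to_be_classified']
    value = classifier.classify(text)  # probability that the text is ham
    return jsonify({'email_class': 'ham' if value >= 0.9 else 'spam'})


if __name__ == "__main__":
    app.run(port=5000)  # hypothetical port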
# df is assumed to be the loaded SMS spam dataset, e.g.:
#   df = pd.read_csv("spam.csv", encoding="latin-1")
# column v1 holds the label ('spam' or 'ham'), column v2 the message text
spam_data, legit_data = [], []
for _, row in df.iterrows():
    if row['v1'] == 'spam':
        spam_data.append(row['v2'])
    else:
        legit_data.append(row['v2'])

NB_classifier = NaiveBayesClassifier()

# train on the first two thirds of each class, test on the rest
spam_train = spam_data[:int(len(spam_data) * 2 / 3)]
spam_test = spam_data[int(len(spam_data) * 2 / 3):]
legit_train = legit_data[:int(len(legit_data) * 2 / 3)]
legit_test = legit_data[int(len(legit_data) * 2 / 3):]

NB_classifier.train(spam_train, legit_train)

# predict() returns a spam probability, so its average over the spam test
# set (and the average of 1 - probability over the legit test set) acts as
# a soft per-class accuracy
spam_accuracy = 0
for text in spam_test:
    prediction = NB_classifier.predict(text)
    spam_accuracy += prediction
spam_accuracy /= len(spam_test)

legit_accuracy = 0
for text in legit_test:
    prediction = NB_classifier.predict(text)
    legit_accuracy += 1 - prediction
legit_accuracy /= len(legit_test)

print("Spam Text prediction accuracy:", spam_accuracy)
print("Legit Text prediction accuracy:", legit_accuracy)
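# The train(spam_train, legit_train) / predict(text) interface above differs
# from the train(train_data) / classify(subject) one used earlier, and the
# class itself isn't shown. A minimal word-presence Naive Bayes sketch
# matching this interface, assuming predict() returns the probability of
# spam; the tokenizer and smoothing constant k are illustrative choices.

import math
import re
from collections import defaultdict


class NaiveBayesClassifier:
    def __init__(self, k=0.5):
        self.k = k  # additive smoothing constant

    def _tokenize(self, text):
        return set(re.findall(r"[a-z0-9']+", text.lower()))

    def train(self, spam_texts, legit_texts):
        self.n_spam, self.n_legit = len(spam_texts), len(legit_texts)
        self.spam_counts = defaultdict(int)
        self.legit_counts = defaultdict(int)
        for text in spam_texts:
            for word in self._tokenize(text):
                self.spam_counts[word] += 1
        for text in legit_texts:
            for word in self._tokenize(text):
                self.legit_counts[word] += 1

    def predict(self, text):
        # start from the (log) class priors
        log_spam = math.log(self.n_spam / (self.n_spam + self.n_legit))
        log_legit = math.log(self.n_legit / (self.n_spam + self.n_legit))
        for word in self._tokenize(text):
            # smoothed P(word appears | class)
            log_spam += math.log((self.spam_counts[word] + self.k)
                                 / (self.n_spam + 2 * self.k))
            log_legit += math.log((self.legit_counts[word] + self.k)
                                  / (self.n_legit + 2 * self.k))
        # convert the log-odds back into a spam probability
        return 1 / (1 + math.exp(log_legit - log_spam))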
import glob
import random
import re
from collections import Counter

data = []

# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    is_spam = "ham" not in fn
    with open(fn, "r") as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to a spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
# (filter() returns an iterator in Python 3, so build a list instead)
spammiest_hams = [row for row in classified if not row[1]][-5:]

# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]
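# split_data() is used above but not defined in this section. One minimal
# sketch, assuming each row independently lands in the training set with
# probability prob (which is why random.seed(0) is called first, so the
# split is reproducible):

import random


def split_data(data, prob):
    """Split data into [train, test], putting roughly a prob
    fraction of the rows into the first list."""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results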