def build_training_set_from_text(text, category, sender=None, subject=None):
    """Build labelled training samples from one message body.

    The text is passed through ``fix_bad_unicode`` and ``strip_signature``,
    split with ``extract_bigrams``, and each resulting item is turned into a
    feature with ``get_feature`` and paired with *category*.

    Args:
        text: Raw message body.
        category: Label attached to every sample produced.
        sender: Optional sender; forwarded to ``strip_signature`` and, when
            given, added as its own ``{sender: True}`` sample.
        subject: Optional subject; when given, added as its own
            ``{subject: True}`` sample.

    Returns:
        A list of ``(feature, category)`` tuples.
    """
    text = fix_bad_unicode(text)
    text = strip_signature(text, sender)
    training_set = [(get_feature(word), category) for word in extract_bigrams(text)]
    # Skip metadata that was not supplied: an omitted sender/subject would
    # otherwise add a meaningless ``{None: True}`` sample. ``classify`` only
    # sets these features when they are not None, so this keeps both in step.
    training_set.extend(
        ({value: True}, category)
        for value in (sender, subject)
        if value is not None
    )
    return training_set
Пример #2
0
def build_training_set_from_text(text, category, sender=None, subject=None):
    """Build labelled training samples from one message body.

    The text is passed through ``fix_bad_unicode`` and ``strip_signature``,
    split with ``extract_bigrams``, and each resulting item is turned into a
    feature with ``get_feature`` and paired with *category*.

    Args:
        text: Raw message body.
        category: Label attached to every sample produced.
        sender: Optional sender; forwarded to ``strip_signature`` and, when
            given, added as its own ``{sender: True}`` sample.
        subject: Optional subject; when given, added as its own
            ``{subject: True}`` sample.

    Returns:
        A list of ``(feature, category)`` tuples.
    """
    text = fix_bad_unicode(text)
    text = strip_signature(text, sender)
    training_set = [(get_feature(word), category) for word in extract_bigrams(text)]
    # Skip metadata that was not supplied: an omitted sender/subject would
    # otherwise add a meaningless ``{None: True}`` sample. ``classify`` only
    # sets these features when they are not None, so this keeps both in step.
    training_set.extend(
        ({value: True}, category)
        for value in (sender, subject)
        if value is not None
    )
    return training_set
Пример #3
0
def classify(text, sender=None, subject=None):
    """Classify a message and return its category name.

    A ``NaiveBayesClassifier`` is trained on the result of
    ``load_training_set()`` on every call. The message is represented as
    ``bag_of_words(extract_bigrams(text))``, with *sender* and *subject*
    added as extra features when they are not None. The per-category
    probabilities are pretty-printed before the best match is returned.

    Args:
        text: Message body to classify.
        sender: Optional sender used as an additional feature.
        subject: Optional subject used as an additional feature.

    Returns:
        ``categories[...]`` entry for the most probable label.
    """
    model = NaiveBayesClassifier.train(load_training_set())
    featureset = bag_of_words(extract_bigrams(text))
    # Metadata is only used as a feature when the caller supplied it.
    for extra in (sender, subject):
        if extra is not None:
            featureset[extra] = True
    distribution = model.prob_classify(featureset)
    # Diagnostic output: probability of each category, keyed by its name.
    probabilities = {}
    for label in distribution.samples():
        probabilities[categories[label]] = distribution.prob(label)
    pprint(probabilities)
    return categories[distribution.max()]
Пример #4
0
def classify(text, sender=None, subject=None):
    """Classify a message and return its category name.

    A ``NaiveBayesClassifier`` is trained on the result of
    ``load_training_set()`` on every call. The message is represented as
    ``bag_of_words(extract_bigrams(text))``, with *sender* and *subject*
    added as extra features when they are not None. The per-category
    probabilities are pretty-printed before the best match is returned.

    Args:
        text: Message body to classify.
        sender: Optional sender used as an additional feature.
        subject: Optional subject used as an additional feature.

    Returns:
        ``categories[...]`` entry for the most probable label.
    """
    model = NaiveBayesClassifier.train(load_training_set())
    featureset = bag_of_words(extract_bigrams(text))
    # Metadata is only used as a feature when the caller supplied it.
    for extra in (sender, subject):
        if extra is not None:
            featureset[extra] = True
    distribution = model.prob_classify(featureset)
    # Diagnostic output: probability of each category, keyed by its name.
    probabilities = {}
    for label in distribution.samples():
        probabilities[categories[label]] = distribution.prob(label)
    pprint(probabilities)
    return categories[distribution.max()]
Пример #5
0
def build_training_set(path='../email_dataset'):
    """Build the training set from every file found in *path*.

    Each file is opened as UTF-8. Its first four lines are read as the
    category (converted with ``int``), sender, receiver and subject; the
    rest of the file is the message text. The text goes through
    ``fix_bad_unicode``, ``strip_signature`` and ``extract_bigrams``, and
    each resulting item becomes a ``(get_feature(item), category)`` sample.
    The sender and the subject are each added as a separate one-key sample.

    Args:
        path: Directory containing the email files.

    Returns:
        A list of ``(feature, category)`` tuples covering all files.

    Raises:
        ValueError: If the first line of a file is not an integer.
    """
    training_set = []
    for email_file in listdir(path):
        with io.open('{}/{}'.format(path, email_file), 'r', encoding='utf8') as email:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(u'Parsing file: {}'.format(email_file))
            # Header fields are one per line, in this fixed order.
            category = int(email.readline().strip())
            sender = email.readline().strip()
            email.readline()  # receiver line: consumed but not used
            subject = email.readline().strip()
            print(u'Training set updated with: [{}]'.format(subject))
            text = fix_bad_unicode(email.read())
            text = strip_signature(text, sender)
            # Extend in place: rebuilding the list with ``+`` for every file
            # copies everything accumulated so far and is quadratic overall.
            training_set.extend(
                (get_feature(word), category) for word in extract_bigrams(text)
            )
            training_set.extend([({sender: True}, category), ({subject: True}, category)])
    return training_set
def build_training_set(path="../email_dataset"):
    """Build the training set from every file found in *path*.

    Each file is opened as UTF-8. Its first four lines are read as the
    category (converted with ``int``), sender, receiver and subject; the
    rest of the file is the message text. The text goes through
    ``fix_bad_unicode``, ``strip_signature`` and ``extract_bigrams``, and
    each resulting item becomes a ``(get_feature(item), category)`` sample.
    The sender and the subject are each added as a separate one-key sample.

    Args:
        path: Directory containing the email files.

    Returns:
        A list of ``(feature, category)`` tuples covering all files.

    Raises:
        ValueError: If the first line of a file is not an integer.
    """
    training_set = []
    for email_file in listdir(path):
        with io.open("{}/{}".format(path, email_file), "r", encoding="utf8") as email:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(u"Parsing file: {}".format(email_file))
            # Header fields are one per line, in this fixed order.
            category = int(email.readline().strip())
            sender = email.readline().strip()
            email.readline()  # receiver line: consumed but not used
            subject = email.readline().strip()
            print(u"Training set updated with: [{}]".format(subject))
            text = fix_bad_unicode(email.read())
            text = strip_signature(text, sender)
            # Extend in place: rebuilding the list with ``+`` for every file
            # copies everything accumulated so far and is quadratic overall.
            training_set.extend(
                (get_feature(word), category) for word in extract_bigrams(text)
            )
            training_set.extend([({sender: True}, category), ({subject: True}, category)])
    return training_set
Пример #7
0
def create_training_dict(text, category):
    """Return a one-item training list for *text* labelled with *category*.

    The single item is the tuple
    ``(bag_of_words(extract_bigrams(text)), category)``.
    """
    featureset = bag_of_words(extract_bigrams(text))
    labelled_sample = (featureset, category)
    return [labelled_sample]
Пример #8
0
def create_training_dict(text, category):
    """Return a one-item training list for *text* labelled with *category*.

    The single item is the tuple
    ``(bag_of_words(extract_bigrams(text)), category)``.
    """
    featureset = bag_of_words(extract_bigrams(text))
    labelled_sample = (featureset, category)
    return [labelled_sample]