def build_training_set_from_text(text, category, sender=None, subject=None):
    """Build a list of ``(feature_dict, category)`` pairs from one email body.

    The text is cleaned (``fix_bad_unicode``), stripped of its signature,
    and turned into bigram features; one extra feature each is added for
    the sender and subject when they are provided.

    :param text: raw email body (unicode or str).
    :param category: the label to pair with every extracted feature.
    :param sender: optional sender address, used as a single feature.
    :param subject: optional subject line, used as a single feature.
    :return: list of (feature_dict, category) tuples.
    """
    text = fix_bad_unicode(text)
    text = strip_signature(text, sender)
    features = extract_bigrams(text)
    training_set = [(get_feature(word), category) for word in features]
    # Fix: the original unconditionally appended ({sender: True}, category)
    # and ({subject: True}, category), producing useless {None: True}
    # features when the defaults were used.  classify() only adds these
    # features when they are not None — mirror that guard here.
    if sender is not None:
        training_set.append(({sender: True}, category))
    if subject is not None:
        training_set.append(({subject: True}, category))
    return training_set
def classify(text, sender=None, subject=None):
    """Classify *text* with a freshly trained Naive Bayes model.

    Trains on the persisted training set, builds a bag-of-words feature
    dict from the text's bigrams (plus optional sender/subject features),
    prints the per-category probability distribution, and returns the
    human-readable name of the most likely category.
    """
    classifier = NaiveBayesClassifier.train(load_training_set())
    test_data = bag_of_words(extract_bigrams(text))
    # Optional header-derived features, added only when supplied.
    for extra in (sender, subject):
        if extra is not None:
            test_data[extra] = True
    dist = classifier.prob_classify(test_data)
    pprint(dict((categories[label], dist.prob(label)) for label in dist.samples()))
    return categories[dist.max()]
def classify(text, sender=None, subject=None):
    """Train a Naive Bayes classifier and classify *text*.

    Prints the probability of each category and returns the name of the
    best-scoring one.  Sender and subject, when given, become extra
    boolean features.
    """
    # NOTE(review): this function is a byte-for-byte duplicate of the
    # earlier classify() definition in this file and shadows it at import
    # time — one of the two definitions should be removed.
    model = NaiveBayesClassifier.train(load_training_set())
    features = bag_of_words(extract_bigrams(text))
    if sender is not None:
        features[sender] = True
    if subject is not None:
        features[subject] = True
    result = model.prob_classify(features)
    probabilities = {}
    for label in result.samples():
        probabilities[categories[label]] = result.prob(label)
    pprint(probabilities)
    return categories[result.max()]
def build_training_set(path='../email_dataset'):
    """Parse every email file under *path* into a training set.

    Each file is expected to contain, one per line: the category (int),
    the sender, the receiver, the subject — followed by the message body.
    Returns a list of ``(feature_dict, category)`` tuples built from the
    body's bigrams plus one feature each for sender and subject.
    """
    training_set = []
    for email_file in listdir(path):
        with io.open('{}/{}'.format(path, email_file), 'r', encoding='utf8') as email:
            print(u'Parsing file: {}'.format(email_file))
            # Header lines, in fixed order.  receiver must still be read
            # to advance past its line even though it is unused.
            category = int(email.readline().strip())
            sender = email.readline().strip()
            receiver = email.readline().strip()
            subject = email.readline().strip()
            print(u'Training set updated with: [{}]'.format(subject))
            text = fix_bad_unicode(email.read())
            text = strip_signature(text, sender)
            features = extract_bigrams(text)
            # Fix: extend/append in place instead of rebuilding the whole
            # list with `training_set = training_set + [...]` on every
            # file, which was quadratic in the total number of features.
            training_set.extend((get_feature(word), category) for word in features)
            training_set.append(({sender: True}, category))
            training_set.append(({subject: True}, category))
    return training_set
def build_training_set(path="../email_dataset"):
    """Build the full training set from the email files in *path*.

    File layout (one field per line): category as an int, sender,
    receiver, subject, then the remaining lines are the message body.
    Returns a list of ``(feature_dict, category)`` tuples.
    """
    # NOTE(review): this re-defines build_training_set() with logic
    # identical to the earlier definition in this file and shadows it at
    # import time — one of the two definitions should be removed.
    training_set = []
    files = listdir(path)
    for email_file in files:
        with io.open("{}/{}".format(path, email_file), "r", encoding="utf8") as email:
            print(u"Parsing file: {}".format(email_file))
            category = int(email.readline().strip())
            sender = email.readline().strip()
            # receiver is consumed to advance the file position; unused.
            receiver = email.readline().strip()
            subject = email.readline().strip()
            print(u"Training set updated with: [{}]".format(subject))
            text = fix_bad_unicode(email.read())
            text = strip_signature(text, sender)
            # Fix: in-place extend/append replaces the original quadratic
            # `training_set = training_set + [...]` re-concatenation.
            training_set.extend(
                (get_feature(word), category) for word in extract_bigrams(text)
            )
            training_set.append(({sender: True}, category))
            training_set.append(({subject: True}, category))
    return training_set
def create_training_dict(text, category):
    """Return a one-element training list for *text*.

    The text's bigrams are folded into a single bag-of-words feature
    dict paired with *category*.
    """
    bag = bag_of_words(extract_bigrams(text))
    return [(bag, category)]