def build_training_set_from_text(text, category, sender=None, subject=None):
    """Build labeled feature sets for a single message body.

    The text is passed through ``fix_bad_unicode`` and ``strip_signature``
    and then handed to ``extract_bigrams``. Every item that call yields
    becomes one ``(get_feature(item), category)`` pair. ``sender`` and
    ``subject`` each contribute one extra ``({value: True}, category)`` pair
    when they are provided.

    Args:
        text: Raw message body.
        category: Label attached to every generated pair.
        sender: Optional sender; also forwarded to ``strip_signature``.
        subject: Optional subject.

    Returns:
        A list of ``(feature_dict, category)`` tuples, text features first,
        followed by the sender and subject features (in that order).
    """
    text = fix_bad_unicode(text)
    text = strip_signature(text, sender)
    training_set = [
        (get_feature(word), category) for word in extract_bigrams(text)
    ]
    # An omitted sender/subject must not turn into a ``{None: True}``
    # feature that every message lacking one would then share.
    training_set.extend(
        ({value: True}, category)
        for value in (sender, subject)
        if value is not None
    )
    return training_set
# Example no. 2
# 0
def build_training_set_from_text(text, category, sender=None, subject=None):
    """Turn one message body into a list of labeled feature sets.

    The body is cleaned with ``fix_bad_unicode`` and ``strip_signature``,
    split with ``extract_bigrams``, and each resulting item is mapped through
    ``get_feature`` and paired with ``category``. When given, ``sender`` and
    ``subject`` are appended as ``({value: True}, category)`` pairs.

    Args:
        text: Raw message body.
        category: Label paired with every feature dict.
        sender: Optional sender; also passed to ``strip_signature``.
        subject: Optional subject.

    Returns:
        A list of ``(feature_dict, category)`` tuples: the text features,
        then the sender feature, then the subject feature.
    """
    cleaned = strip_signature(fix_bad_unicode(text), sender)
    training_set = [
        (get_feature(word), category) for word in extract_bigrams(cleaned)
    ]
    # Skip metadata that was not supplied: the ``None`` defaults would
    # otherwise be recorded as a spurious ``{None: True}`` feature.
    if sender is not None:
        training_set.append(({sender: True}, category))
    if subject is not None:
        training_set.append(({subject: True}, category))
    return training_set
# Example no. 3
# 0
def build_training_set(path='../email_dataset'):
    """Build labeled feature sets from every file found in ``path``.

    Each file is opened as UTF-8 text. Its first four lines are read as the
    category (converted with ``int``), sender, receiver and subject; the
    remainder is the message body. The body is cleaned with
    ``fix_bad_unicode`` and ``strip_signature``, split with
    ``extract_bigrams``, and every resulting item becomes a
    ``(get_feature(item), category)`` pair. The sender and subject are added
    as ``({value: True}, category)`` pairs.

    Progress messages are printed for each file.

    Args:
        path: Directory whose entries are all read as email files.

    Returns:
        A list of ``(feature_dict, category)`` tuples for all files.

    Raises:
        ValueError: If a file's first line cannot be converted by ``int``.
    """
    training_set = []
    for email_file in listdir(path):
        with io.open('{}/{}'.format(path, email_file), 'r', encoding='utf8') as email:
            # Single-argument print(...) emits the same output under the
            # Python 2 print statement and is valid syntax on Python 3.
            print(u'Parsing file: {}'.format(email_file))
            category = int(email.readline().strip())
            sender = email.readline().strip()
            email.readline()  # receiver line: consumed to advance, not used
            subject = email.readline().strip()
            print(u'Training set updated with: [{}]'.format(subject))
            text = fix_bad_unicode(email.read())
        # The file is fully read at this point, so it can be closed before
        # the feature extraction runs.
        text = strip_signature(text, sender)
        # extend() grows the list in place instead of copying the whole
        # accumulated list once per file.
        training_set.extend(
            (get_feature(word), category) for word in extract_bigrams(text)
        )
        training_set.extend(
            [({sender: True}, category), ({subject: True}, category)]
        )
    return training_set
def build_training_set(path="../email_dataset"):
    """Read every file in ``path`` and return the combined training set.

    Files are opened as UTF-8 text. The first four lines of each file are
    taken as category (converted with ``int``), sender, receiver and subject,
    and the rest as the message body. The body goes through
    ``fix_bad_unicode``, ``strip_signature`` and ``extract_bigrams``; each
    resulting item yields a ``(get_feature(item), category)`` pair, and the
    sender and subject yield one ``({value: True}, category)`` pair each.

    A progress message is printed before and after each file's header is
    parsed.

    Args:
        path: Directory whose entries are all read as email files.

    Returns:
        A list of ``(feature_dict, category)`` tuples covering all files.

    Raises:
        ValueError: If a file's first line cannot be converted by ``int``.
    """
    training_set = []
    for email_file in listdir(path):
        with io.open("{}/{}".format(path, email_file), "r", encoding="utf8") as email:
            # print(...) with one argument prints identically under the
            # Python 2 print statement and also parses on Python 3.
            print(u"Parsing file: {}".format(email_file))
            category = int(email.readline().strip())
            sender = email.readline().strip()
            email.readline()  # receiver line: read only to skip past it
            subject = email.readline().strip()
            print(u"Training set updated with: [{}]".format(subject))
            body = fix_bad_unicode(email.read())
        # Everything needed has been read; process after the file is closed.
        body = strip_signature(body, sender)
        # In-place growth avoids re-copying the accumulated list per file.
        training_set.extend(
            (get_feature(word), category) for word in extract_bigrams(body)
        )
        training_set.append(({sender: True}, category))
        training_set.append(({subject: True}, category))
    return training_set