Example #1
import itertools
import math
from operator import itemgetter

from networkx import Graph, pagerank  # assumed: the graph and PageRank come from NetworkX
from nltk import tokenize


def summarize_email(raw_text, sender=None, language='english'):
    # strip_signature, cachedStopWords, get_tokenized and cosine are
    # project-level helpers defined elsewhere in this module.
    raw_text = strip_signature(raw_text, sender)
    stopwords = cachedStopWords
    sentence_list = tokenize.sent_tokenize(raw_text, language)
    word_set = [
        get_tokenized(sentence, stopwords) for sentence in sentence_list
    ]

    # Connect every pair of sentences whose token lists have a positive
    # cosine similarity. Filtering empty token lists *after* enumerate keeps
    # the node indexes aligned with sentence_list (filtering before
    # enumerate would shift them).
    graph = Graph()
    non_empty = [pair for pair in enumerate(word_set) if pair[1]]
    for (index_a, words_a), (index_b, words_b) in itertools.combinations(non_empty, 2):
        similarity = cosine(words_a, words_b)
        if similarity > 0:
            graph.add_edge(index_a, index_b, weight=similarity)

    if not graph.edges():
        return sentence_list[0]

    # Rank sentences by PageRank, keep the top third, and re-emit them in
    # their original document order.
    ranked_sentence_indexes = pagerank(graph).items()
    sentences_by_rank = sorted(ranked_sentence_indexes,
                               key=itemgetter(1),
                               reverse=True)
    summary_size = int(math.ceil(len(sentence_list) / 3))
    best_sentences = map(itemgetter(0), sentences_by_rank[:summary_size])
    best_sentences_in_order = sorted(best_sentences)
    return ' '.join(sentence_list[index] for index in best_sentences_in_order)
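The snippet relies on module-level helpers the example page does not show. Below is a minimal sketch of the assumed setup; the names cachedStopWords, get_tokenized and cosine come from the call sites above, but these bodies are illustrative reconstructions, not the original project's code:

import math

from nltk.corpus import stopwords

# Hypothetical module-level stopword cache (reconstructed, not original).
cachedStopWords = set(stopwords.words('english'))


def get_tokenized(sentence, stop_words):
    # Illustrative: lowercase alphanumeric tokens with stopwords removed.
    return [word for word in sentence.lower().split()
            if word.isalnum() and word not in stop_words]


def cosine(words_a, words_b):
    # Illustrative: cosine similarity of the two token lists treated as
    # binary term vectors, i.e. |A ∩ B| / sqrt(|A| * |B|).
    if not words_a or not words_b:
        return 0.0
    overlap = len(set(words_a) & set(words_b))
    return overlap / math.sqrt(len(set(words_a)) * len(set(words_b)))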
Example #3
def build_training_set_from_text(text, category, sender=None, subject=None):
    # fix_bad_unicode, strip_signature, extract_bigrams and get_feature are
    # project-level helpers defined elsewhere in this module.
    text = fix_bad_unicode(text)
    text = strip_signature(text, sender)
    features = extract_bigrams(text)
    # One (featureset, label) pair per extracted bigram, plus the sender
    # and subject as single boolean features.
    training_set = [(get_feature(word), category) for word in features]
    training_set += [({sender: True}, category), ({subject: True}, category)]
    return training_set
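The ({feature: True}, category) pairs are exactly the (featureset, label) format NLTK classifiers consume, so the output can be fed straight into a Naive Bayes trainer. A quick usage sketch, assuming get_feature returns a {token: True} dict as the other pairs suggest, with a made-up email body and labels:

import nltk

training_set = build_training_set_from_text(
    'Meeting moved to Friday, please confirm.',  # placeholder body
    category='scheduling',
    sender='alice@example.com',
    subject='Meeting update',
)
classifier = nltk.NaiveBayesClassifier.train(training_set)
print(classifier.classify({'meeting moved': True}))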
Example #4
import io
from os import listdir


def build_training_set(path='../email_dataset'):
    # fix_bad_unicode, strip_signature, extract_bigrams and get_feature are
    # project-level helpers defined elsewhere in this module.
    training_set = []
    for email_file in listdir(path):
        with io.open('{}/{}'.format(path, email_file), 'r', encoding='utf8') as email:
            print(u'Parsing file: {}'.format(email_file))
            # The first four lines of each file are the category label,
            # sender, receiver and subject; the rest is the message body.
            category = int(email.readline().strip())
            sender = email.readline().strip()
            receiver = email.readline().strip()  # read to skip the line; unused
            subject = email.readline().strip()
            print(u'Training set updated with: [{}]'.format(subject))
            text = fix_bad_unicode(email.read())
            text = strip_signature(text, sender)
            features = extract_bigrams(text)
            training_set.extend((get_feature(word), category) for word in features)
            training_set += [({sender: True}, category), ({subject: True}, category)]
    return training_set
def build_training_set(path="../email_dataset"):
    training_set = []
    files = listdir(path)
    for email_file in files:
        with io.open("{}/{}".format(path, email_file), "r", encoding="utf8") as email:
            print u"Parsing file: {}".format(email_file)
            category, sender, receiver, subject = (
                int(email.readline().strip()),
                email.readline().strip(),
                email.readline().strip(),
                email.readline().strip(),
            )
            print u"Training set updated with: [{}]".format(subject)
            text = fix_bad_unicode(email.read())
            text = strip_signature(text, sender)
            features = extract_bigrams(text)
            training_set = training_set + [(get_feature(word), category) for word in features]
            training_set = training_set + [({sender: True}, category), ({subject: True}, category)]
    return training_set
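For reference, the four readline() calls imply that each dataset file starts with four header lines before the body. A minimal driver, with the directory layout reconstructed from those calls and NLTK's Naive Bayes trainer assumed as the consumer:

# Assumed layout of each file under ../email_dataset (reconstructed):
#   line 1: category (an integer label)
#   line 2: sender
#   line 3: receiver
#   line 4: subject
#   remaining lines: message body
import nltk

training_set = build_training_set(path='../email_dataset')
classifier = nltk.NaiveBayesClassifier.train(training_set)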