def summarize_email(raw_text, sender=None, language='english'):
    """Return an extractive summary (roughly 1/3 of the sentences) of an email.

    The signature is stripped, the body is split into sentences, and a
    similarity graph (cosine similarity between sentence token sets) is
    ranked with PageRank; the top-ranked sentences are joined back in
    document order.

    :param raw_text: full email body text.
    :param sender: optional sender identifier forwarded to strip_signature.
    :param language: sentence-tokenizer language (default 'english').
    :returns: the summary string; the first sentence when no sentence pair
        shares vocabulary; '' when the stripped body has no sentences.
    """
    raw_text = strip_signature(raw_text, sender)
    stopwords = cachedStopWords
    sentence_list = tokenize.sent_tokenize(raw_text, language)
    if not sentence_list:
        # Nothing to summarize (empty body after signature stripping).
        return ''
    word_set = [get_tokenized(sentence, stopwords) for sentence in sentence_list]
    graph = Graph()
    # Enumerate BEFORE dropping empty token sets.  The previous version did
    # enumerate(filter(None, word_set)), which numbered the *filtered*
    # sequence, so ranked indices no longer lined up with sentence_list
    # whenever any sentence produced no tokens.
    indexed_words = [(i, words) for i, words in enumerate(word_set) if words]
    for (index_a, words_a), (index_b, words_b) in itertools.combinations(indexed_words, 2):
        similarity = cosine(words_a, words_b)
        if similarity > 0:
            graph.add_edge(index_a, index_b, weight=similarity)
    if not graph.edges():
        # No pair of sentences is similar; fall back to the opening sentence.
        return sentence_list[0]
    ranked_sentence_indexes = pagerank(graph).items()
    sentences_by_rank = sorted(ranked_sentence_indexes, key=itemgetter(1), reverse=True)
    # Float division so the ceil actually rounds up: under Python 2,
    # len(...) / 3 truncated first and math.ceil was a no-op.
    summary_size = int(math.ceil(len(sentence_list) / 3.0))
    best_sentences = map(itemgetter(0), sentences_by_rank[:summary_size])
    best_sentences_in_order = sorted(best_sentences)
    return ' '.join(sentence_list[index] for index in best_sentences_in_order)
def build_training_set_from_text(text, category, sender=None, subject=None):
    """Build labelled feature samples from a single email's text.

    The text is cleaned (unicode repair, signature stripping), bigram
    features are extracted, and two extra single-key features — the raw
    sender and subject values — are appended at the end.

    :param text: raw email body.
    :param category: label attached to every produced sample.
    :param sender: optional sender string (also becomes a feature key).
    :param subject: optional subject string (also becomes a feature key).
    :returns: list of (feature_dict, category) tuples.
    """
    cleaned = strip_signature(fix_bad_unicode(text), sender)
    samples = [(get_feature(word), category) for word in extract_bigrams(cleaned)]
    samples.append(({sender: True}, category))
    samples.append(({subject: True}, category))
    return samples
def build_training_set(path='../email_dataset'):
    """Build a classifier training set from every email file under *path*.

    Each file's first four lines are: category (an int), sender, receiver
    and subject; the remainder of the file is the email body.  Per-email
    features are produced by build_training_set_from_text.

    :param path: directory containing the email files.
    :returns: list of (feature_dict, category) tuples.
    """
    training_set = []
    for email_file in listdir(path):
        with io.open('{}/{}'.format(path, email_file), 'r', encoding='utf8') as email:
            # Single-argument print() behaves identically on Py2 and Py3.
            print(u'Parsing file: {}'.format(email_file))
            category = int(email.readline().strip())
            sender = email.readline().strip()
            email.readline()  # receiver line: consumed to advance, value unused
            subject = email.readline().strip()
            print(u'Training set updated with: [{}]'.format(subject))
            # Delegate to the single-email helper instead of duplicating its
            # cleaning/feature-extraction logic; extend() avoids the
            # quadratic cost of repeated list concatenation.
            training_set.extend(
                build_training_set_from_text(email.read(), category,
                                             sender=sender, subject=subject))
    return training_set
def build_training_set(path="../email_dataset"): training_set = [] files = listdir(path) for email_file in files: with io.open("{}/{}".format(path, email_file), "r", encoding="utf8") as email: print u"Parsing file: {}".format(email_file) category, sender, receiver, subject = ( int(email.readline().strip()), email.readline().strip(), email.readline().strip(), email.readline().strip(), ) print u"Training set updated with: [{}]".format(subject) text = fix_bad_unicode(email.read()) text = strip_signature(text, sender) features = extract_bigrams(text) training_set = training_set + [(get_feature(word), category) for word in features] training_set = training_set + [({sender: True}, category), ({subject: True}, category)] return training_set
def summarize_email(raw_text, sender=None, language='english'):
    """Extractively summarize an email down to about a third of its sentences.

    NOTE(review): this redefines the identical summarize_email declared
    earlier in the file; at import time this later definition wins.  One of
    the two should be deleted.

    Sentences are ranked by PageRank over a cosine-similarity graph of
    their token sets, then the best ones are re-joined in document order.

    :param raw_text: full email body text.
    :param sender: optional sender identifier forwarded to strip_signature.
    :param language: sentence-tokenizer language (default 'english').
    :returns: the summary string; the first sentence when no sentence pair
        shares vocabulary; '' when the stripped body has no sentences.
    """
    body = strip_signature(raw_text, sender)
    sentences = tokenize.sent_tokenize(body, language)
    if not sentences:
        # Empty body after signature stripping — nothing to rank.
        return ''
    token_sets = [get_tokenized(s, cachedStopWords) for s in sentences]
    graph = Graph()
    # Number the token sets BEFORE filtering out empty ones, so graph node
    # ids stay aligned with positions in `sentences`.  The previous version
    # enumerated after filter(None, ...), which shifted every index past an
    # empty token set and made the final lookup select wrong sentences.
    nonempty = [(idx, toks) for idx, toks in enumerate(token_sets) if toks]
    for (i, toks_i), (j, toks_j) in itertools.combinations(nonempty, 2):
        weight = cosine(toks_i, toks_j)
        if weight > 0:
            graph.add_edge(i, j, weight=weight)
    if not graph.edges():
        # Nothing connected: degrade gracefully to the opening sentence.
        return sentences[0]
    ranking = sorted(pagerank(graph).items(), key=itemgetter(1), reverse=True)
    # Divide as floats so math.ceil rounds up; Python 2's int division
    # truncated first and silently neutralized the ceil.
    keep = int(math.ceil(len(sentences) / 3.0))
    chosen = sorted(idx for idx, _ in ranking[:keep])
    return ' '.join(sentences[idx] for idx in chosen)