Example #1
File: sum.py Project: kalki7/CaseSum
def sumbasic(parser, sentence_count):
    # NOTE: relies on a module-level `language` variable
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(parser.document, sentence_count)
    return ''.join(str(sentence) for sentence in summary)
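Note: this snippet depends on a module-level `language` variable and a prepared `parser`. A minimal usage sketch, assuming English text and the standard sumy imports (the sample text is hypothetical):

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

language = "english"  # assumed global read inside sumbasic()
parser = PlaintextParser.from_string("Some long text to summarize.", Tokenizer(language))
print(sumbasic(parser, 5))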
Example #2
def summarize_all(text):
    LANGUAGE = "english"
    SENTENCES_COUNT = 1


    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)


    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))

    allvariations = []

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
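Note: `Lsa`, `Luhn`, `LexRank`, `TxtRank`, `SumBasic`, and `KL` are presumably import aliases; a plausible import block, assuming the standard sumy modules:

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.text_rank import TextRankSummarizer as TxtRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL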
Example #3
def sumbasic_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = [str(sentence) for sentence in summarizer(parser.document, sentences_count)]
    return "\n".join(sentences)
Example #4
def sumbasicReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        #print(sentence._text)
        sentencesList.append(sentence._text)

    return sentencesList
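Note: `sentence._text` reaches into a private attribute; `str(sentence)` is the public equivalent:

sentencesList.append(str(sentence))  # same result without touching a private field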
Example #6
def SumBasic(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the configured language

    summarizer = SumBasicSummarizer(stemmer)  # SumBasic algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
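Note: a usage sketch with hypothetical file names; depending on the sumy version, Chinese tokenization may require an extra dependency such as jieba:

SumBasic('article_zh.txt', 'summary_zh.txt', 3)  # hypothetical paths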
Example #7
def summary(url):
  parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
  stemmer = Stemmer(LANGUAGE)

  summarizer = Summarizer(stemmer)
  summarizer.stop_words = get_stop_words(LANGUAGE)

  res = []

  for sentence in summarizer(parser.document, SENTENCES_COUNT):
    res.append(str(sentence))

  return res
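Note: `LANGUAGE`, `SENTENCES_COUNT`, and `Summarizer` (presumably an alias for SumBasicSummarizer) are module-level here. A usage sketch with a hypothetical URL:

print(summary("https://example.com/article.html"))  # hypothetical URL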
Example #8
def basic_sum(file, test_ratio=0.10, israndom=True):
    # extract test files
    file_lines = file.read().splitlines()
    nsamples = len(file_lines)
    ntests = int(nsamples * test_ratio)
    if israndom:
        seq = np.random.permutation(nsamples)
    else:
        seq = np.arange(nsamples)
    
    # summarizer
    stemmer = Stemmer(_language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(_language)
    
    # rouge
    rouge = Rouge155()
    
    scores = defaultdict(list)
    for i in range(ntests):
        line = file_lines[seq[i]]
        sample = json.loads(line)
        content = sample['content']
        title = sample['title']
        ref_text = {'A': title}
        doc = ' '.join(content)
        parser = PlaintextParser.from_string(doc, Tokenizer(_language))
        sum_sents = summarizer(parser.document, _sent_count)
        if len(sum_sents) != _sent_count:
            continue
        summary = str(sum_sents[0])
        score = rouge.score_summary(summary, ref_text)
        for k, v in score.items():
            scores[k].append(v)
        print('{} / {} processed.'.format(i, ntests), end='\r')
    result = {}
    for k, v in scores.items():
        result[k] = mean(v)
    return result
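Note: the snippet relies on module globals `_language` and `_sent_count`, plus numpy, json, and a ROUGE binding. A sketch of the assumed surrounding setup (Rouge155 is assumed to come from pyrouge):

import json
import numpy as np
from collections import defaultdict
from statistics import mean
from pyrouge import Rouge155  # assumed ROUGE wrapper

_language = "english"
_sent_count = 1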
Example #9
def _build_summarizer(stop_words, stemmer=None):
    summarizer = SumBasicSummarizer() if stemmer is None else SumBasicSummarizer(stemmer)
    summarizer.stop_words = stop_words
    return summarizer
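Note: a usage sketch, assuming the standard sumy helpers:

from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

summarizer = _build_summarizer(get_stop_words("english"), Stemmer("english"))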
Example #10
def _build_summarizer(self, stop_words):
    summarizer = SumBasicSummarizer()
    summarizer.stop_words = stop_words
    return summarizer
Example #11
import os


#create folder
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


LANGUAGE = "bangla"
SENTENCES_COUNT = 2

if __name__ == "__main__":

    createFolder('Dataset/NCTB/SumBasicSummary/')
    for i in range(1, 140):
        serial_no = str(i)
        path = "Dataset/NCTB/Source/" + serial_no + ".txt"
        parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = ""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary = summary + " " + str(sentence)
        with open('Dataset/NCTB/SumBasicSummary/' + serial_no + '.txt', 'w') as fi:
            fi.write(summary)
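Note: stock sumy may not ship a Bangla stemmer or Bangla stop words, so the project presumably provides its own; `Summarizer` here is likely an alias for SumBasicSummarizer. A plausible import block under those assumptions:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.sum_basic import SumBasicSummarizer as Summarizer
from sumy.utils import get_stop_words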
Example #12
def run_SumBasic(stemmer, document, n):
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic: {}".format(n))
    res = summarizer(document, SENTENCES_COUNT)
    return " ".join(str(res[0]).split()[:n])
Example #13
def sum_basic(text, config=None):
    if config is None:
        config = {'summary_length': 1}
    summarizer = SumBasicSummarizer(stemmer.lemmatize)
    summarizer.stop_words = STOP_WORDS
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summary = summarizer(parser.document, config['summary_length'])
    return ' '.join([str(s) for s in summary])
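Note: passing `stemmer.lemmatize` works because SumBasicSummarizer only needs a word-to-word callable; `stemmer` and `STOP_WORDS` are module globals here. A usage sketch with hypothetical input:

print(sum_basic("Long input text goes here.", {'summary_length': 2}))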
Example #14
def __summarize(self, parser):
    summarizer = SumBasicSummarizer(Stemmer(self.__language))
    summarizer.stop_words = get_stop_words(self.__language)
    final_sentences = summarizer(parser.document, self.__sentences_count)
    return self.__join_sentences(final_sentences)
Example #16
def textteaser_test():

    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary

    # obtain the input article from url
    #url = "http://www.nytimes.com/2016/11/17/us/politics/donald-trump-administration-twitter.html?ref=politics"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # obtain the input article from plain text files
    parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))

    # define the language; English is the default
    stemmer = Stemmer(LANGUAGE)

    # SumBasic algorithm
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LSA algorithm
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("Latent Semantic Analysis:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # TextRank algorithm
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("TextRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LexRank algorithm
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("LexRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    #Featured-LexRank algorithm
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        first_line = f.readline()
    title = first_line
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        text = f.read()
    tt = TextTeaser()

    sentences = tt.summarize(title, text)
    file = open("tt.txt", "w", encoding='utf-8-sig')
    print("Featured-LexRank:")
    for sentence in sentences:
        file.write("%s\n" % sentence)
    file.close()

    parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    sys.stdout = sys.__stdout__
    summary.close()
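Note: the function assumes several third-party imports; a plausible import block (TextTeaser is assumed to come from the textteaser package):

from textteaser import TextTeaser  # assumed third-party package
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer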
Example #17
def build_sum_basic(parser, language):
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return summarizer
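Note: the `parser` argument is unused inside the function; only `language` matters. A usage sketch, assuming a parsed document:

summarizer = build_sum_basic(parser, "english")
for sentence in summarizer(parser.document, 3):
    print(sentence)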
Example #18
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import sys


LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]


if __name__ == "__main__":
    
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
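Note: the script reads the input file and sentence count from the command line, e.g. (script and file names hypothetical):

python summarize_sumbasic.py article.txt 3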