Exemplo n.º 1
0
    def getText(self, sentence_count=None):
        """Fetch the page at ``self.url`` and return an extractive summary.

        Parameters
        ----------
        sentence_count : int, optional
            Number of sentences to keep.  When given, it is stored on
            ``self.SENTENCE_COUNT`` and therefore persists across calls.

        Returns
        -------
        str
            The selected sentences joined by newlines.
        """
        # Explicit None check: the old truthiness test silently ignored an
        # explicitly passed falsy value such as 0.
        if sentence_count is not None:
            self.SENTENCE_COUNT = sentence_count
        parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        sentences = summarizer(parser.document, self.SENTENCE_COUNT)
        return "\n".join(str(sentence) for sentence in sentences)
Exemplo n.º 2
0
    def process_item(self, item, spider):
        """Scrapy pipeline hook: attach a summary of ``item['content']``.

        Builds an extractive summary and stores it in ``item['summary']``;
        the (possibly mutated) item is returned for the next pipeline stage.
        """
        # pickle_path = os.path.dirname(os.path.realpath(__file__))+'/../../nltk_data/tokenizers/punkt/english.pickle'
        # tokenizer = nltk.data.load(pickle_path)

        parser = PlaintextParser.from_string(item['content'],
                                             Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        # str(sentence) uses sumy's public interface instead of the private
        # ``_text`` attribute, and join() avoids the stray leading space the
        # old ``+= ' ' + ...`` accumulation produced.
        item['summary'] = ' '.join(
            str(sentence)
            for sentence in summarizer(parser.document, SENTENCES_COUNT))
        return item
Exemplo n.º 3
0
 def init_model(self, model_type):
     """Create the sumy summarizer selected by ``model_type``.

     Supported keys: 'lsa', 'lexrank', 'textrank', 'luhn', 'kl', 'edmun'.
     Stores an English stemmer on ``self.stemmer`` and the constructed
     summarizer on ``self.summarizer``.

     Raises
     ------
     ValueError
         If ``model_type`` is not one of the supported keys (the old
         if/elif chain silently left ``self.summarizer`` unset, causing a
         confusing AttributeError later).
     """
     self.stemmer = Stemmer('english')
     # Dispatch table replaces the long if/elif chain.
     summarizer_classes = {
         'lsa': Summarizer,
         'lexrank': lxrSummarizer,
         'textrank': texrSummarizer,
         'luhn': luhSummarizer,
         'kl': klSummarizer,
         'edmun': edmSummarizer,
     }
     try:
         self.summarizer = summarizer_classes[model_type](self.stemmer)
     except KeyError:
         raise ValueError('unknown model_type: %r' % (model_type,))
Exemplo n.º 4
0
def summarize_news(url):
    """Print an extractive summary of the news article at *url*.

    Also prints the article's very first sentence for comparison.
    Output is UTF-8 encoded bytes (legacy behavior preserved).
    """
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    document = parser.document
    summary = [sentence._text
               for sentence in summarizer(document, SENTENCES_COUNT)]

    print(' '.join(summary).encode('utf-8'))
    print(document.paragraphs[0].sentences[0]._text)
Exemplo n.º 5
0
    def summ(url):
        """Return the page at *url* condensed to ``SENTENCES_COUNT`` sentences.

        The result keeps the original single leading space and has no
        separators between sentences (legacy behavior preserved).
        """
        # url = "https://www.hindustantimes.com/tech/samsung-galaxy-note-9-launch-live-full-specifications-features-and-more/story-heLEeZMY2rl2j55Wd5LWgP.html"
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)

        pieces = [str(sentence)
                  for sentence in summarizer(parser.document, SENTENCES_COUNT)]
        return " " + "".join(pieces)
Exemplo n.º 6
0
def SumBasic(rsc_file, dst_file, count):
    """Summarize the Chinese text file *rsc_file* with the SumBasic
    algorithm and write *count* sentences to *dst_file*, one per line
    (each sentence is also echoed to stdout).
    """
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    stemmer = Stemmer(language)  # language-specific stemmer

    summarizer = SumBasicSummarizer(stemmer)  # SumBasic algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as out:
        for sentence in summarizer(parser.document, count):
            out.write(str(sentence) + '\n')
            print(sentence)
def summarize_lsa(document, sentences_count=SENTENCES_COUNT):
    """Return an LSA summary of *document* as a list of sentence strings."""
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))

    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # parser = PlaintextParser.from_file("covid.txt", Tokenizer(LANGUAGE))

    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return [str(sentence)
            for sentence in summarizer(parser.document, sentences_count)]
Exemplo n.º 8
0
def summarize(text,
              summarizer,
              sentence_count,
              bonus_words=None,
              language='english'):
    """Summarize *text* with the given sumy summarizer class.

    Parameters
    ----------
    text : str
        Plain text to summarize.
    summarizer : type
        A sumy summarizer class; it is instantiated with a stemmer.
    sentence_count : int
        Number of sentences to return.
    bonus_words : list of str, optional
        Only used for EdmundsonSummarizer.  Defaults to the historical
        value ['MLK, rights'].
    language : str
        Language for tokenizer, stemmer, and stop words.

    Returns
    -------
    tuple of sumy Sentence objects.
    """
    if bonus_words is None:
        # None sentinel instead of a mutable default argument.
        # NOTE(review): the historical default is a single string containing
        # a comma — possibly meant as ['MLK', 'rights']; preserved as-is.
        bonus_words = ['MLK, rights']
    summarizer = summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    if isinstance(summarizer, EdmundsonSummarizer):
        summarizer.bonus_words = bonus_words
        summarizer.stigma_words = ['zdfgthdvndadv']  # deliberate nonsense word
        summarizer.null_words = summarizer.stop_words
    summary = summarizer(
        PlaintextParser(text, Tokenizer(language)).document, sentence_count)
    return summary
Exemplo n.º 9
0
def lets_summarize(url):
    """Return the three top-ranked sentences from the page at *url*.

    The result is a list of sumy Sentence objects (not strings).
    """
    language = "english"
    sentence_limit = 3
    parser = HtmlParser.from_url(url, Tokenizer(language))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return [sentence
            for sentence in summarizer(parser.document, sentence_limit)]
Exemplo n.º 10
0
def evaluate_summary(file_name, input_dir, sent_count, lingua_franca_summary, show_summaries):
    """Compare a Lingua Franca summary against six sumy baselines with ROUGE.

    Reads the gold summary from ``<input_dir><file_name>.model``, scores the
    supplied ``lingua_franca_summary`` and each sumy summarizer against it,
    and returns ``(df_rouge, summarizer_list)`` from
    ``process_rouge_scores``.  Returns ``None`` if any step raises (the
    exception is logged).
    """
    method_name = inspect.stack()[0][3]
    try:
        process_logger.debug("in " + method_name + " method")
        # ``with`` blocks close the handles the old code leaked.
        with open(input_dir + file_name + ".model", "r") as file_model_summary:
            model_summary = file_model_summary.read()

        rouge_scores_dict = {}
        rouge_scores = rouge_evaluation(lingua_franca_summary, model_summary)
        rouge_scores_dict[">>LINGUA FRANCA"] = rouge_scores
        with open("Test System Summary/" + file_name + "-" + "LINGUA FRANCA" + ".txt", "w") as file_summary:
            file_summary.write(lingua_franca_summary)

        LANGUAGE = "english"
        parser = PlaintextParser.from_file(input_dir + file_name + ".txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        # Run every baseline through the same scoring helper; labels must
        # match those expected downstream.
        baselines = [
            ("LSA", LsaSummarizer(stemmer)),
            ("LEX RANK", LexRankSummarizer(stemmer)),
            ("LUHN", LuhnSummarizer(stemmer)),
            ("TEXT RANK", TextRankSummarizer(stemmer)),
            ("SUM BASIC", SumBasicSummarizer(stemmer)),
            ("KL SUM", KLSummarizer(stemmer)),
        ]
        for label, summarizer in baselines:
            rouge_scores = sumy_summarizers(label, summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
            rouge_scores_dict[label] = rouge_scores

        # score_reader(rouge_scores_dict)
        df_rouge, summarizer_list = process_rouge_scores(rouge_scores_dict)

        return df_rouge, summarizer_list

    except Exception as Ex:
        error_logger.error("Exception occurred in " + method_name + "| Exception:" + str(Ex))
        return None
Exemplo n.º 11
0
def summarize():
    """ Returns summary of articles """
    if request.method == 'POST':
        page_url = request.form['pageurl']
        parser = HtmlParser.from_url(page_url, Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)

        # Render the selected sentences as a list of strings.
        final = [str(sentence)
                 for sentence in summarizer(parser.document, SENTENCES_COUNT)]
        return render_template('result.html', len=len(final), summary=final)
Exemplo n.º 12
0
def generate_luhn_summary(input_text, top_n):
    """Summarize *input_text* to its *top_n* sentences with the Luhn
    summarizer (``Summarizer4``), print the result, and return it.

    Sentences are stripped and concatenated with no separator
    (legacy behavior preserved).
    """
    parser = PlaintextParser.from_string(input_text, Tokenizer(LANGUAGE))
    summarizer = Summarizer4(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final_text = "".join(
        str(sentence).strip()
        for sentence in summarizer(parser.document, top_n))
    print(final_text)
    return final_text
Exemplo n.º 13
0
def webBrowse():
    """Summarize the web page named in the ``url`` widget.

    Writes each selected sentence (prefixed with "->  ") to a fixed output
    file, echoes it to stdout, then opens the file with the default viewer.
    """
    SENTENCES_COUNT = numOfSent.get()
    parser = HtmlParser.from_url(url.get(), Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizerurl(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    output_path = "C://Users//rakesh chandra//Desktop//ATS//outputU.txt"
    # ``with`` guarantees the file is flushed and closed before os.startfile
    # opens it (the old code leaked the handle and risked showing an
    # incompletely-written file).
    with open(output_path, 'w') as outputFile:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            outputFile.write("->  ")
            outputFile.write(str(sentence))
            outputFile.write("\n \n")
    os.startfile(output_path)
def get_summaries_from_list_of_abstracts(list_of_abstracts, summarizer_type):
    """Summarize each abstract to three sentences.

    Parameters
    ----------
    list_of_abstracts : iterable of str
        Plain-text abstracts.
    summarizer_type : str
        One of 'lsa', 'luhn', 'lexrank', 'textrank'.

    Returns
    -------
    list of str
        One space-joined summary per abstract (also printed).

    Raises
    ------
    ValueError
        For an unknown ``summarizer_type`` (the old if/elif chain crashed
        later with UnboundLocalError instead).
    """
    summarizer_classes = {
        'lsa': LsaSummarizer,
        'luhn': LuhnSummarizer,
        'lexrank': LexRankSummarizer,
        'textrank': TextRankSummarizer,
    }
    try:
        summarizer = summarizer_classes[summarizer_type](Stemmer("english"))
    except KeyError:
        raise ValueError('unknown summarizer_type: %r' % (summarizer_type,))

    summarizer.stop_words = get_stop_words("english")

    list_of_summaries = []

    for abstract in list_of_abstracts:
        parser = PlaintextParser(abstract, Tokenizer("english"))
        summary = summarizer(parser.document, 3)
        list_of_summaries.append(" ".join(map(str, summary)))

    print(list_of_summaries)

    return list_of_summaries
def summarizer_lsa(text, url):
    """Summarize *text* to three sentences with LSA (``Summarizer_1``).

    *url* is accepted but unused (a commented-out HTML path remains).
    Each sentence's words are space-joined and terminated with ". ".
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 3
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer_1(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    parts = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        parts.append(u" ".join(sentence.words))
        parts.append(u". ")
    return u"".join(parts)
Exemplo n.º 16
0
def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Collect the selected sentences and return them as JSON.
    final = [str(sentence)
             for sentence in summarizer(parser.document, SENTENCES_COUNT)]
    return jsonify(summary=final)
Exemplo n.º 17
0
def load():
    """Build Chinese summarization / keyword-extraction components.

    NOTE(review): every object constructed here (tt, summarizer, textrank,
    w2v, ...) is bound only to locals and the function returns None —
    presumably the original stored or returned these; verify against the
    caller before relying on this function.
    """
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 5
    # article_max_len=500
    tt=tkitText.Text()
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Configure jieba with a custom dictionary and stop-word list, then
    # grab its TextRank keyword extractor.
    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')
    textrank = jieba.analyse.textrank

    # Load a pre-trained word2vec model; path comes from a module-level name.
    w2v=tkitW2vec.Word2vec()
    w2v.load(model_file=Word2vec_model)
Exemplo n.º 18
0
    def summarize_doc(self, bullets=3):
        """Summarize the local file ``document.txt`` into *bullets*
        sentences, printing each sentence and the final count, and return
        the sentences concatenated with no separator.
        """
        parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)

        chosen = []
        for sentence in summarizer(parser.document, bullets):
            print(sentence)
            chosen.append(str(sentence))

        print(len(chosen))
        return "".join(chosen)
Exemplo n.º 19
0
    def getSummary(self):
        """Return a five-sentence summary of ``self.extractor.text()``.

        Each sentence is preceded by a single space (so a non-empty result
        starts with one) — legacy behavior preserved.
        """
        language = "english"
        sentence_limit = 5

        parser = PlaintextParser.from_string(self.extractor.text(), Tokenizer(language))
        summarizer = Summarizer(Stemmer(language))
        summarizer.stop_words = get_stop_words(language)

        return "".join(
            " " + str(sentence)
            for sentence in summarizer(parser.document, sentence_limit))
Exemplo n.º 20
0
def get_summary(text):
    """Return up to 20 summary sentences of *text* as one string.

    Ellipses ("...") are collapsed to periods before tokenizing; every
    sentence is preceded by a single space (legacy behavior preserved).
    """
    language = "english"
    sentence_limit = 20

    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    cleaned = text.replace("...", ".")
    parser = PlaintextParser.from_string(cleaned, Tokenizer(language))

    return "".join(
        " " + str(sentence)
        for sentence in summarizer(parser.document, sentence_limit))
Exemplo n.º 21
0
def text_summary(text, summary_size=0.25):
    """Summarize *text* to roughly ``summary_size`` of its sentences.

    The sentence budget is ``floor(summary_size * number_of_sentences)``
    (so very short inputs may yield an empty result).  Each selected
    sentence is preceded by a single space — legacy behavior preserved.
    """
    language = "english"
    budget = math.floor(summary_size * len(sent_tokenize(text)))

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    return "".join(
        " " + str(sentence)
        for sentence in summarizer(parser.document, budget))
Exemplo n.º 22
0
    def summarize(self, paragraphs):
        """Summarize content with the sumy library.

        *paragraphs* are joined with blank lines; sumy's '10%' count asks
        for roughly a tenth of the sentences.  Returns a list of
        single-element lists, each containing one sentence string.
        """
        document_text = '\n\n'.join(paragraphs)

        summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        parser = PlaintextParser.from_string(document_text, Tokenizer(LANGUAGE))

        return [[str(sentence)]
                for sentence in summarizer(parser.document, '10%')]
Exemplo n.º 23
0
def process3(ex):
    """Summarize ``inputtext.txt`` into *ex* sentences.

    Prints the elapsed time since the module-level ``start_time`` and
    returns the selected sumy Sentence objects (not strings).
    """
    language = "english"
    parser = PlaintextParser.from_file("inputtext.txt", Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    selected = list(summarizer(parser.document, ex))
    # NOTE(review): time.clock() was removed in Python 3.8 — confirm the
    # interpreter version and how module-level ``start_time`` is produced.
    print(time.clock() - start_time, "seconds")

    return selected
Exemplo n.º 24
0
def prepare_summary(query):
    """Summarize ``query['text']`` into ``query['numberOfSentences']``
    sentences and return their raw text as a list of strings.
    """
    text = query['text']
    number_of_sentences = query['numberOfSentences']

    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # ``_text`` is sumy's internal attribute; kept for byte-identical output.
    return [sentence._text
            for sentence in summarizer(parser.document, number_of_sentences)]
Exemplo n.º 25
0
def summarize_file(file_name):
    """Summarize the plain-text file *file_name* and return the selected
    sentences as a list of strings.
    """
    #url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    parser = PlaintextParser.from_file(file_name, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return [str(sentence)
            for sentence in summarizer(parser.document, SENTENCES_COUNT)]
Exemplo n.º 26
0
def update_summary(n_clicks, dropdown_language, sentences_count,
                   summarizer_opt, text_area):
    """Update textbox with summary.

    Parameters must be passed in the same order as Inputs and State
    in the callback decorator.

    Parameters
    ----------
    n_clicks : int
        Button click: 0 if unclicked, 1 if clicked.
    dropdown_language : str
        Language selected in the dropdown menu.
    sentences_count : int
        Number of sentences in the summary.
    summarizer_opt : str
        Key into the module-level ``all_summarizers`` mapping selecting
        the summarizer class.
    text_area : str
        Input text: can be URL or plain text.

    Returns
    -------
    str
        Summary of the text, once the button is pressed.  Implicitly
        returns None while the button has not been clicked.

    """
    # Button is clicked
    if n_clicks > 0:

        # Summarize from URL (anything starting with 'http')
        if text_area.startswith('http'):
            parser = HtmlParser.from_url(text_area.strip(),
                                         Tokenizer(dropdown_language))

        # Summarize plain text
        else:
            parser = PlaintextParser.from_string(text_area,
                                                 Tokenizer(dropdown_language))

        stemmer = Stemmer(dropdown_language)

        summarizer = all_summarizers[summarizer_opt](stemmer)
        summarizer.stop_words = get_stop_words(dropdown_language)

        sentences = [
            str(sentence)
            for sentence in summarizer(parser.document, sentences_count)
        ]

        return '\n' + '\n\n'.join(sentences)
Exemplo n.º 27
0
def gen_sum(document, n, alg="LSA"):
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if alg == "LSA":
        return run_LSA(stemmer, parser.document)
    elif alg == "LexRank":
        return run_LexRank(stemmer, parser.document)
    elif alg == "TextRank":
        return run_TextRank(stemmer, parser.document)
    elif alg == "Luhn":
        return run_Luhn(stemmer, parser.document)
    elif alg == "SumBasic":
        return run_SumBasic(stemmer, parser.document, n)
    else:
        exit("Unkown extractive summarization algorithm!")
Exemplo n.º 28
0
def summarize_text(request):
    """Summarize the document carried by *request* and send it back as HTML.

    Chooses an HTML or plain-text parser based on ``request.html``,
    summarizes to ``SENTENCES_COUNT`` sentences, renders them with
    ``generate_html`` under ``request.title``, and calls
    ``request.send_html`` with the result.
    """
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html,
                                      url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        # NOTE(review): this branch runs when ``request.html`` is falsy,
        # yet still passes it as file_path — likely a bug; the intended
        # attribute (e.g. a plain-text path on the request) needs to be
        # confirmed against the request type's definition.
        parser = PlaintextParser.from_file(file_path=request.html,
                                           tokenizer=Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # fix_text repairs mojibake/encoding issues in each sentence and title.
    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
Exemplo n.º 29
0
    def summarize(self, method='luhn'):
        """Summarize text.

        Parameters
        ----------
        method : str
            Summarizer name; resolved to a summarizer class by
            ``self._check_method``.

        Returns
        -------
        str
            The selected sentences concatenated with no separator.

        Raises
        ------
        ValueError
            If neither ``self.url`` nor ``self.html`` is set (the old code
            fell through and crashed with NameError on ``parser``).
        """
        method = self._check_method(method)

        if self.url:
            parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        elif self.html:
            parser = HtmlParser(self.html, Tokenizer(self.LANGUAGE))
        else:
            raise ValueError("no URL or HTML content to summarize")
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = method(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        sumy = summarizer(parser.document, self.SENTENCES_COUNT)
        summary = ''.join(str(i) for i in sumy)

        return summary
Exemplo n.º 30
0
def summarized():
    """Summarize the URL submitted via the 'url-field' form input.

    The sentence count comes straight from the 'quantity' form field
    (left as a string, as sumy accepts it).  Returns the selected sumy
    Sentence objects as a list.
    """
    # Requests data in forms
    page_url = request.form['url-field']
    sentence_count = request.form['quantity']
    language = "english"

    # Summarization
    parser = HtmlParser.from_url(page_url, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    # Renamed from ``list`` to avoid shadowing the builtin.
    sentences = [sentence
                 for sentence in summarizer(parser.document, sentence_count)]
    return sentences