Example #1
File: iatv.py Project: mtpain/iatv
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''

    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):
        # file-like object: read its contents (the `file` builtin no longer exists in Python 3)
        parser = PlaintextParser.from_string(text.read(), Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either a str or a file-like object')

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
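These snippets generally assume module-level sumy imports and a LANGUAGE constant. A plausible preamble (an assumption; it is not shown in the original source) would be:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"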
Example #2
 def test_split_into_words(self):
     sentences1 = PlaintextParser.from_string("One, two two. Two. Three.", 
         Tokenizer("english")).document.sentences
     self.assertEqual(["One", "two", "two", "Two", "Three"], 
         _split_into_words(sentences1))
     
     sentences2 = PlaintextParser.from_string("two two. Two. Three.", 
         Tokenizer("english")).document.sentences
     self.assertEqual(["two", "two", "Two", "Three"], 
         _split_into_words(sentences2))
Example #3
    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
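summarize_with_info returns sumy Sentence objects rather than plain text. A minimal usage sketch, assuming `svc` is an instance of the surrounding class with LANGUAGE set to "english":

sentences = svc.summarize_with_info(corpus, 3, "lexrank")
text = " ".join(str(s) for s in sentences)  # convert Sentence objects to a plain string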
Example #4
def summarizeFile(inputFile):
	summarizer = LsaSummarizer(stem_word)
	summarizer.stop_words = get_stop_words("english")
	url = findURLS(inputFile)
	if url is not None:
		if url[-1] == '.':
			url = url[0:-1]
		#print (url)
		#urlContent = 'Summary from URL ['+url+']: \n'
		urlContent = ''
		try:
			parser = HtmlParser.from_url(url, Tokenizer("english"))		
			for sentence in summarizer(parser.document, 3):
				urlContent = urlContent + str(sentence) + '\n'
		except Exception:
			#print (sys.exc_info()[0])
			urlContent = ''
	content = inputFile.read()
	parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
	#summarizer = LsaSummarizer(stem_word)
	#summarizer.stop_words = get_stop_words(LANGUAGE)
	#summary = 'Event Summary: \n'
	summary = ''
	try:
		for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
			summary = summary + str(sentence) + '\n'
	except AssertionError:
		return None
	if url is not None:
		return summary + urlContent
	return summary
Example #5
def get_summary(source_text, compression_factor):
    """
    Given some input source_text, returns its summary based on the chosen 
    compression factor.
    """
    summary = {
        'source_text': source_text,
        'compression_factor': compression_factor,
        'summary': '',
        'success': False
    }
    
    parser = PlaintextParser.from_string(source_text, Tokenizer("english"))
    summ_algo = LexRankSummarizer()
    final_line_num = int(source_text.count('.') / compression_factor)
    try:
        raw_summary = summ_algo(parser.document, final_line_num)
        for sentence in raw_summary:
            summary['summary'] += str(sentence) + ' '
    except Exception:
        pass

    summary['success'] = (len(summary['summary']) != 0)

    return summary
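For illustration only, a call with a compression factor of 3 on a six-sentence text is expected to return roughly two sentences; the input string here is made up:

result = get_summary(
    "First point. Second point. Third point. Fourth point. Fifth point. Sixth point.", 3)
if result['success']:
    print(result['summary'])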
Example #6
 def test_get_word_ngrams(self):
     sentences = PlaintextParser.from_string("This is a test.", 
         Tokenizer("english")).document.sentences
     correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
     found_ngrams = _get_word_ngrams(2, sentences)
     for ngram in correct_ngrams:
         self.assertTrue(ngram in found_ngrams)      
Example #7
def _firstK_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))

    geneSen = parser.document.sentences[:SENTENCES_COUNT]
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences

    # print geneSen
    # print "=========="
    # print refSen
    # print evaluate(geneSen, refSen)
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)
        print(e)
        raise e
Example #8
    def summarize(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus,Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary
Example #9
def kl_rank_sum(path, K):
    # Note: despite its name, this example uses LexRank rather than the KL summarizer.
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, K)  # K is the number of sentences to return
    return summary
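Because the return value is a tuple of sumy Sentence objects, callers typically stringify them; a short illustrative sketch (the file name is hypothetical):

for sentence in kl_rank_sum("article.txt", 3):
    print(str(sentence))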
Example #10
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = "No compatible summarizer was selected, please use one of these : textrank, lexrank, luhn, edmonson*, kl, lsa, sumbasic, random (* doesn\'t work yet)"
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus,Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))

        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join([obj._text for obj in summarizer(parser.document, length)])

        return summary

    except Exception as e:
        return str(e)
Example #11
def summarize(string, summary_length = 1, language = "english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    return ". ".join([str(sentence) for sentence in summarizer(parser.document, summary_length)]) 
Example #12
def sumrise(text, sentences=5):
    if validators.url(text):
        text = web2text.getwebtxt(text)

    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = LsaSummarizer()

    # Join the returned Sentence objects; str() on the whole tuple would not give readable text
    summary = ' '.join(str(s) for s in summarizer(parser.document, sentences))
    return summary
Example #13
    def summarize(self, extracted_refs, facet_results, max_length=250, mode='citance'):
        '''
        Summarizes the extracted references based on community detection

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
            mode(str) -- can be citance, reference 

        '''
        citances = defaultdict(list)
        summarizer = LexRankSummarizer(Stemmer('english'))
        summary = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            citances[t[0]['topic']].append(
                {'refs': t[0]['sentence'],
                 'citance': self.clean_citation(t[0]['citation_text'])})

        for topic, citance in citances.items():
            # Create graph of citation similarities
            vectorizer = TfidfVectorizer(
                tokenizer=self.tokenize, min_df=1, max_df=len(citances) * .9)
            cit_vectors = vectorizer.fit_transform(
                [e['citance'] for e in citance]).toarray()
            cit_text = {
                i: v for i, v in enumerate(citance)}
            cit_dict = {i: v for i, v in enumerate(cit_vectors)}
            cits = []
            for e in cit_dict:  # vector (numpy array)
                for e1 in cit_dict:
                    if e != e1:
                        simil = self.cossim(cit_dict[e],
                                            cit_dict[e1])
                        if simil > 0.1:
                            cits.append((e, e1, simil))
            G = nx.Graph()
            G.add_weighted_edges_from(cits)
            part = community.best_partition(G)
            clusters = defaultdict(list)
            tokenize = SentTokenizer(offsets=False)
            for k, v in part.items():
                clusters[v].extend(tokenize(citance[k]['refs']))
            # clusters includes ref sentences that belong in each cluster
            # Find the most salient sentence in each cluster
            sal_in_cluster = {}  # salient sentences for each cluster
            for i in clusters:
                parser = PlaintextParser.from_string(
                    ' '.join(clusters[i]).replace('\\', ''), Tokenizer('english'))
                summ = summarizer(parser.document, 5)
                # 5 is the number of sentences returned by LexRank
                sal_in_cluster[i] = [str(s) for s in summ]
                # The most salient sentences in each cluster
            summary[topic.upper()] =\
                self.pick_from_cluster(
                    sal_in_cluster, max_length, weighted=False)
        return summary
Example #14
def summary():
    max_sent = 10
    language = 'english'
    url = request.form['summary']
    tokenizer = Tokenizer(language)
    article = alt_extract(url)
    parser = PlaintextParser.from_string(article, tokenizer)
    summary = summarizer(parser, max_sent, language).decode('utf-8')
    return render_template('summary.html', url=url, summary=summary)
Example #15
    def summarize(self, extracted_refs, facet_results, max_length=250):
        '''
        Summarizes the extracted references based on the facet results

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
        '''
        summaries = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            topic = t[0]['topic']
            citance = t[0]['citance_number']
            if isinstance(t[0]['sentence'][0], list):
                logger.warning('Unexpected, should check')
            summaries[topic.upper()]\
                [facet_results[topic.upper()]
                 [str(citance)]['SVM_LABEL']].append([t[0]['citation_text']])

        summarizer = TextRankSummarizer(Stemmer('english'))

        final_summ = defaultdict(lambda: defaultdict(dict))
        ret_summ = defaultdict(list)
        counts = defaultdict(lambda: defaultdict(dict))
        for t in summaries:
            for facet in summaries[t]:
                if len(summaries[t][facet]) > 1:
                    summs = list(
                        itertools.chain.from_iterable(summaries[t][facet]))
                    parser = PlaintextParser.from_string(
                        ' '.join(summs), Tokenizer('english'))
                    summ = summarizer(parser.document, max_length)
                    final_summ[t][facet] = [str(sent) for sent in summ]
                    counts[t][facet] = len(final_summ[t][facet])
                else:
                    final_summ[t][facet] = self.s_t(summaries[t][facet][0])
            i = 0
            while self.w_t.count_words(ret_summ[t]) < max_length:
                for fct in final_summ[t]:
                    if i < len(final_summ[t][fct]):
                        ret_summ[t].append(final_summ[t][fct][i])
                i += 1
            while self.w_t.count_words(ret_summ[t]) > max_length:
                ret_summ[t].pop()


#         summ = defaultdict(list)
#         tokzer = WordTokenizer(stem=False)
#         for k in final_summ:
#             i = 0
#             while tokzer.count_words(summ[k]) < max_length:
#                 for f in final_summ[k]:
#                     if len(final_summ[k][f]) > i and\
#                             tokzer.count_words(summ[k]) < max_length:
#                         summ[k].append(final_summ[k][f][i])
        return ret_summ
Example #16
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total
Example #17
 def getSummary(self, num_sentences):
     lex_rank = LexRankSummarizer()
     text = str(self.bpLargGetText())
     parser = PlaintextParser.from_string(text, Tokenizer('english'))
     summary = lex_rank(parser.document, num_sentences)
     sentences = []
     for sent in summary:
         sentences.append(str(sent))
     return sentences
Example #18
def summary(text, summarizer_class):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        yield sentence
Example #19
	def get_summary(self, text):
		parser = PlaintextParser.from_string(text, Tokenizer("english"))
		summarizer = LexRankSummarizer()
		summary = summarizer(parser.document, 3)  # Summarize the document with 3 sentences

		result = ""
		for sentence in summary:
			result += " " + str(sentence)

		return result
Example #20
 def summarizeText(self, body, numSentences = 10):
     """Summarizes body of text to numSentences
     """
     parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
     return summary
Example #21
def summarize(filename, num_sentences):
    with open(filename, "r") as myfile:
        data = myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english'))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        # Strip quotes and non-ASCII characters from each sentence
        summary += str(sentence).encode('ascii', 'ignore').decode().replace('"', '').replace("'", '').strip() + " "
    return summary
Example #22
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        [str(sentence) for sentence in summarizer(parser.document, COUNT)]
    )
    summary = Summary(content=content, summary=text)
    summary.save()
Example #23
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
Example #24
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
Example #25
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "

    return result
Example #26
    def test_issue_5_sigma_can_multiply_matrix_v(self):
        """Source: https://github.com/miso-belica/sumy/issues/5"""
        parser = PlaintextParser.from_string(
            load_resource("articles/sigma_can_multiply_matrix_v.txt"),
            Tokenizer("english")
        )
        summarizer = LsaSummarizer(english_stemmer)
        summarizer.stop_words = get_stop_words("english")

        sentences = summarizer(parser.document, 20)
        self.assertEqual(len(sentences), 20)
Example #27
def summarize(text, size=2):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	summarizer = LexRankSummarizer()

	summary = summarizer(parser.document, size)
	summarize_text=""
	for sentence in summary:
	    summarize_text+=(str(sentence)+" ")
	    
	summarize_text=summarize_text.strip()
	return summarize_text 
Example #28
def lex_rank_sum(path, L):
    output = []
    parser = PlaintextParser.from_file(path, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, L)  # L is the number of sentences to return
    for sentence in summary: # option for writing to a summary output file.
        item = str(sentence)
        output.append(item)
    return output
Example #29
def _summ_score(storyName, highlightName):
    parser = PlaintextParser.from_file(storyName, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    geneSen = summarizer(parser.document, SENTENCES_COUNT)
    refSen = PlaintextParser.from_file(highlightName, Tokenizer(LANGUAGE)).document.sentences


    #print geneSen
    #print "=========="
    #print refSen
    try:
        return evaluate(geneSen, refSen)
    except Exception as e:
        print(storyName)
        print(e)
        raise e
Example #30
def get_summary(text, max_sentences=5):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	stemmer = Stemmer("english")

	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words("english")

	summary = []
	for sentence in summarizer(parser.document, max_sentences):
		# keep the ASCII-only text of each sentence
		summary.append(sentence._text.encode('ascii', 'ignore').decode())

	return summary
Example #31
def main(args=None):
    summarizer, document, items_count, reference_summary = handle_arguments()

    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(reference_summary,
                                                     Tokenizer(language))
    reference_sentences = reference_document.document.sentences

    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))

    return 0
Example #32
    def __init__(self, transcript_file_path, summary_number):
        """ Input a transcript_file_path in the form of a string and a
			summary_number denoting the number of sentences requested in the summary.
		"""
        self.transcript_file = transcript_file_path
        with open(self.transcript_file, "r") as transcript:
            full_transcript_text = transcript.read()
        self.tokenized_transcript = sent_tokenize(full_transcript_text)

        LANGUAGE = "english"
        parser = PlaintextParser.from_file(self.transcript_file,
                                           Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        self.summary = summarizer(parser.document, summary_number)
Example #33
    def summarize(self, fields):
        """
        yields the summary on a hit to facilitate building bulk update
        """
        assert self.content_field in fields
        content = fields[self.content_field][0]
        language = fields[self.lang_field][0] if self.lang_field in fields else 'en'
        language = LANGUAGE_MAP[language]
        parser = PlaintextParser.from_string(content, Tokenizer(language))
        stemmer = Stemmer(language)
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)

        sentences = [str(s) for s in summarizer(parser.document, self.count)]
        summary = ' '.join(sentences)
        return summary
Example #34
def extract_summary_keywords(trend,urls,titles):  
	total_articles_content=extract_text(urls)
	keywords=extract_keywords_from_all_text(total_articles_content,titles)
	current_path = os.path.dirname(os.path.realpath(__file__))
	current_path = os.path.join(current_path, trend + '.txt')
	with open(current_path, 'w') as the_file:
		the_file.write(total_articles_content)
	parser = PlaintextParser.from_file(current_path, Tokenizer(LANGUAGE))
	os.remove(current_path)
	sentences=''
	for sentence in summarizer(parser.document, 12):
		sentences=sentences+' '+str(sentence) 
	replaced_syn=replacesynonym(sentences)
	matches = tool.check(sentences)
	correct_summary=language_check.correct(sentences, matches)
	return correct_summary,keywords
Example #35
def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final = []

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        final.append(str(sentence))
    length = len(final)
    return render_template('paraphrase.html', report=final, length=length)
Example #36
def summy_lex_rank_process_article_file(file_path):
    sents = []
    with io.open(file_path, 'r', encoding='utf-8') as article_file:
        for line in article_file:
            if line.find('@highlight') != -1:
                break
            line = line.strip()
            sents.extend(sent_tokenize(line))
    parser = PlaintextParser.from_string(' '.join(sents), Tokenizer('english'))
    summarizer = LexRankSummarizer()
    # Summarize the document with NUM_SUM_SENTS sentences
    sums = summarizer(parser.document, NUM_SUM_SENTS)
    res_list = []
    for summary in sums:
        res_list.append(str(summary))
    return res_list
Example #37
def test_document_is_all_in_upper_case():
    """
    When all words is in upper case Plaintext parser first line as heading and
    LexRank algorithm raises exception "ZeroDivisionError: float division by zero"
    because there is no sentence to summarize.
    See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 0
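A possible workaround for the all-caps case, along the lines of Example #11 above, is to lowercase the document before parsing; this is a sketch, not part of the test:

text = "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER."
if text.isupper():
    text = text.lower()
parser = PlaintextParser.from_string(text, Tokenizer("english"))
# the parser now yields real sentences, so LexRank has something to rank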
Example #38
    def get_summary(self):
        # Each line holds one sentence, so split on newlines
        sentences = [t for t in self._text.split('\n')]
        for i in range(1):
            print(sentences[i])

        # Build the janome morphological analyzer
        analyzer = Analyzer(
            [
                UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter(r'[(\)「」、。]', ' ')
            ],  # replace (, ), 「, 」, 、 and 。 with spaces
            JanomeTokenizer(),
            [
                POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                ExtractAttributeFilter('base_form')
            ]  # keep only the base forms of nouns, adjectives, adverbs and verbs
        )

        # Join the extracted words with spaces.
        # The trailing '。' lets the tinysegmenter-based tokenizer used below split them back into sentences.
        corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
        """
		for i in range(2):
		    print(corpus[i])
		"""
        # 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。
        # 今年 11 月 SIer Web サービス 会社 転職 する。
        """
		from sumy.parsers.plaintext import PlaintextParser
		from sumy.nlp.tokenizers import Tokenizer
		from sumy.summarizers.lex_rank import LexRankSummarizer
		"""

        # Tokenize the joined corpus again with the Japanese (tinysegmenter-based) tokenizer
        parser = PlaintextParser.from_string(''.join(corpus),
                                             Tokenizer('japanese'))

        # Extract a two-sentence summary with LexRank
        summarizer = LexRankSummarizer()
        summarizer.stop_words = [' ']  # a lone space is recognized as a word, so exclude it via stop words

        self.summary = summarizer(document=parser.document, sentences_count=2)

        # Print the original (pre-analysis) sentences
        for sentence in self.summary:
            print(sentences[corpus.index(sentence.__str__())])
Example #39
def summarize(selected_text, n=3):
    from sumy.parsers.plaintext import PlaintextParser  #We're choosing a plaintext parser here, other parsers available for HTML etc.
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer  #We're choosing Lexrank, other algorithms are also built in

    output = ''

    parser = PlaintextParser(selected_text, Tokenizer("english"))
    summarizer = LexRankSummarizer()

    summary = summarizer(parser.document, n)  # Summarize the document with n sentences

    for sentence in summary:
        output += str(sentence)

    return output
Example #40
def generating_transcript(para):
    temp = ""
    for i in range(len(para)):
        f1 = open("f1.txt", "w+")
        f1.write(para[i].text + "\n")
        f1.close()
        parser = PlaintextParser.from_file("f1.txt", Tokenizer("english"))
        summarizer = LexRankSummarizer()
        summary = summarizer(parser.document, 2)
        for sentence in summary:
            if(len(str(sentence)) > 30):
                temp += str(sentence) + "\n\n"

    f1 = open("f1.txt", "w+")
    f1.write(temp)
    f1.close()

    return temp
Example #41
def boring():
    boringStuff = request.form['boringstuff']
    parser = PlaintextParser.from_string(request.form['boringstuff'],
                                         Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentencesCount = int(request.form['sentences']) if request.form[
        'sentences'] else 3

    sentences = summarizer(parser.document, sentencesCount)
    return render_template('index.html',
                           sentences=sentences,
                           sentencesCount=sentencesCount,
                           boringStuff=boringStuff)
Example #42
def models_LUHN_LEX_LSA_2(dataframe):
    ##    Candidate models:
    #        Bag of Words
    #        FastText
    #        word2vec
    #        LDA (topic extra)
    #        skip-thoughts
    #        doc2vec
    #        LSTM

    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    size = len(dataframe)
    stemmer = Stemmer(LANGUAGE)

    for i in range(0, size):
        article = dataframe.loc[i, "post_content"]

        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        summarizerLUHN = LUHN(stemmer)
        summarizerLUHN.stop_words = stop

        summarizerLEX = LEX(stemmer)
        summarizerLEX.stop_words = stop

        summarizerLSA = LSA(stemmer)
        summarizerLSA.stop_words = stop

        LUHNsentence = summarizerLUHN(parser.document,
                                      1)  #summarize document with one sentence
        LEXsentence = summarizerLEX(parser.document,
                                    1)  #summarize document with one sentence
        LSAsentence = summarizerLSA(parser.document,
                                    1)  #summarize document with one sentence

        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
Example #43
    def summarize(self, excerpt: str, len_s: int) -> str:
        parser = PlaintextParser.from_string(excerpt, Tokenizer('english'))

        document = parser.document
        dictionary = self.summarizer._create_dictionary(document)

        if dictionary is None:
            return excerpt

        words_count = len(dictionary)
        sentences_count = len(document.sentences)
        if words_count < sentences_count:
            return excerpt

        sents = self.summarizer(parser.document, len_s)

        return ' '.join(str(s) for s in sents)
Example #44
    def run_single(self, document):
        parser = PlaintextParser.from_string(document, Tokenizer(self.language))
        document = parser.document

        self.summarizer._ensure_dependencies_installed()

        sentences_words = [self.summarizer._to_words_set(s) for s in document.sentences]
        if not sentences_words:
            return tuple()

        tf_metrics = self.summarizer._compute_tf(sentences_words)
        idf_metrics = self.summarizer._compute_idf(sentences_words)

        matrix = self.summarizer._create_matrix(sentences_words, self.summarizer.threshold, tf_metrics, idf_metrics)
        scores = self.summarizer.power_method(matrix, self.summarizer.epsilon)

        return list(map(str, document.sentences)), list(scores)
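run_single returns the document's sentences alongside their LexRank scores. A hedged usage sketch (assuming `runner` is an instance of the surrounding class) that picks the top-scoring sentences:

sentences, scores = runner.run_single(document_text)
ranked = sorted(zip(sentences, scores), key=lambda pair: pair[1], reverse=True)
top_three = [sentence for sentence, _ in ranked[:3]]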
Example #45
def videoFunction(search, video_sentence_count, no_video):
    results = YoutubeSearch(search, max_results=no_video).to_json()
    results = json.loads(results)
    filtered_results = []
    for result in results['videos']:
        v = result['id']
        transcript_flag = True
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(v)
        except Exception:
            print("no transcripts")
            transcript_flag = False

        if transcript_flag:
            try:
                transcript = transcript_list.find_manually_created_transcript(
                    ['en', 'en-UK', 'en-US'])
            except Exception:
                print("no transcript")
                transcript_flag = False

        if transcript_flag:
            transcript_proto = transcript.fetch()
            final_transcript = ''
            for obj in transcript_proto:
                final_transcript += obj['text'] + ' '

            key_words = keywords(final_transcript, ratio=0.1)

            parser = PlaintextParser.from_string(final_transcript,
                                                 Tokenizer("english"))
            stemmer = Stemmer(LANGUAGE)

            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)

            summary = ''
            for sentence in summarizer(parser.document, video_sentence_count):
                summary += str(sentence)

            result['summary'] = summary
            result['id'] = result['id'].split('&')[0]
            result['keywords'] = key_words
            filtered_results.append(result)

    return filtered_results
Example #46
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech"))
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
Example #47
def response():
    if request.method == 'POST':
        text_org = request.json['foo']
        text = json.loads(json.dumps(text_org))
        text = re.sub(r'[^A-Za-z0-9()\[\]]', ' ', str(text))
        text = text.lower()
        if len(text.split()) <= 3:
            resp = ' '.join(['please give some more sentences.'])
            return resp
        else:
            parser = PlaintextParser.from_string(text, Tokenizer('english'))
            sum_1 = summarizer(parser.document, 10)
            sum_lex = [str(sent) for sent in sum_1]
            resp = ' '.join(sum_lex)
            return resp
Example #48
def summarize():
    SENTENCES_COUNT = numOfSent.get()
    parser = PlaintextParser.from_file(fileName.cget("text"),
                                       Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    output_path = "C://Users//rakesh chandra//Desktop//ATS//output.txt"
    with open(output_path, 'w') as outputFile:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            outputFile.write("->  ")
            outputFile.write(str(sentence))
            outputFile.write("\n \n")
    os.startfile(fileName.cget("text"))
    os.startfile(output_path)
Example #49
def summarization(text, alg="lexrank"):
    parser = PlaintextParser.from_string("".join(text), Tokenizer("japanese"))

    if alg == "lexrank":
        summarizer = LexRankSummarizer()
    elif alg == "textrank":
        summarizer = TextRankSummarizer()
    elif alg == "lsa":
        summarizer = LsaSummarizer()
    else:
        raise ValueError("Unknown summarization algorithm: " + alg)

    summarizer.stop_words = [" "]
    abst = summarizer(document=parser.document, sentences_count=5)
    abst = ["".join(sentence.words) for sentence in abst]

    return abst