Example #1
def test_tokenize_japanese_sentence():
    tokenizer = Tokenizer('japanese')
    assert tokenizer.language == 'japanese'

    sentence = 'この文章を、正しくトークン化したい。'
    expected = ('この', '文章', 'を', '正しく', 'トークン', '化', 'し', 'たい')
    assert expected == tokenizer.to_words(sentence)
Example #2
def test_tokenize_chinese_sentence():
    tokenizer = Tokenizer('chinese')
    assert tokenizer.language == 'chinese'

    sentence = '好用的文档自动化摘要程序。'
    expected = ('好用', '的', '文档', '自动化', '摘要', '程序')
    assert expected == tokenizer.to_words(sentence)
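The two tests above exercise sumy's Tokenizer directly. A minimal standalone sketch for reproducing them follows; the note that Japanese and Chinese word tokenization rely on optional packages such as tinysegmenter and jieba is an assumption about sumy's extras, so install them if the calls fail with an import error.

# Minimal sketch, assuming sumy is installed; tinysegmenter / jieba are assumed
# to back the Japanese / Chinese tokenizers.
from sumy.nlp.tokenizers import Tokenizer

if __name__ == "__main__":
    print(Tokenizer('japanese').to_words('この文章を、正しくトークン化したい。'))
    print(Tokenizer('chinese').to_words('好用的文档自动化摘要程序。'))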
Example #3
def test_tokenize_sentence():
    tokenizer = Tokenizer("english")
    words = tokenizer.to_words("I am a very nice sentence with comma, but..")

    expected = (
        "I", "am", "a", "very", "nice", "sentence",
        "with", "comma",
    )
    assert expected == words
Example #4
def test_tokenize_japanese_paragraph():
    tokenizer = Tokenizer('japanese')
    expected = (
        '1つ目の文章です。',
        'その次は何が来ますか?',
        '「2つ目の文章」です。'
    )
    paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
    assert expected == tokenizer.to_sentences(paragraph)
Example #5
def test_tokenize_chinese_paragraph():
    tokenizer = Tokenizer('chinese')
    expected = (
        '我正在为这个软件添加中文支持。',
        '这个软件是用于文档摘要!',
        '这个软件支持网页和文本两种输入格式?'
    )

    paragraph = '我正在为这个软件添加中文支持。这个软件是用于文档摘要!这个软件支持网页和文本两种输入格式?'
    assert expected == tokenizer.to_sentences(paragraph)
Example #6
def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences
Example #7
def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences
Example #8
    def test_ensure_czech_tokenizer_available(self):
        tokenizer = Tokenizer("czech")
        self.assertEqual("czech", tokenizer.language)

        sentences = tokenizer.to_sentences("""
            Měl jsem sen, že toto je sen. Bylo to také zvláštní.
            Jakoby jsem plaval v moři rekurze.
        """)

        expected = (
            "Měl jsem sen, že toto je sen.",
            "Bylo to také zvláštní.",
            "Jakoby jsem plaval v moři rekurze.",
        )
        self.assertEqual(expected, sentences)
Example #9
    def test_slovak_alias_into_czech_tokenizer(self):
        tokenizer = Tokenizer("slovak")
        self.assertEqual(tokenizer.language, "slovak")

        sentences = tokenizer.to_sentences("""
            Je to veľmi fajn. Bodaj by nie.
            Ale na druhej strane čo je to oproti inému?
            To nechám na čitateľa.
        """)

        expected = (
            "Je to veľmi fajn.",
            "Bodaj by nie.",
            "Ale na druhej strane čo je to oproti inému?",
            "To nechám na čitateľa.",
        )
        self.assertEqual(expected, sentences)
Example #10
def test_slovak_alias_into_czech_tokenizer():
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences
Example #11
def lexrank_summarize(input_dict):
    """Return the summarized text built from the dictionary produced by get_raw_dict().

    input:
    input_dict: dictionary whose values hold each document's content under the "RAW_TEXT" key
    """
    summarizer_lr = LexRankSummarizer()
    lr_fulltext = ""

    for value in input_dict.values():
        text = value["RAW_TEXT"]
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        summary = summarizer_lr(parser.document, 1)[0]
        lr_fulltext += str(summary) + " "
    return lr_fulltext
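The function above expects a dictionary shaped like the output of get_raw_dict(), which is not shown. The sketch below is a hypothetical usage, assuming each value stores its document text under the "RAW_TEXT" key, together with the imports lexrank_summarize() itself needs.

# Hypothetical usage; the dictionary shape is inferred from the docstring and
# the imports are the ones lexrank_summarize() requires.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

raw_dict = {
    "doc-1": {"RAW_TEXT": "First document. It has two sentences."},
    "doc-2": {"RAW_TEXT": "Second document. Also two sentences."},
}
print(lexrank_summarize(raw_dict))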
Example #12
 def summarize(self, documents, posts, max_sents):
     # print('Preprocessing...')
     documents = self.preprocess(documents)
     summaries = []
     for doc in documents:
         doc = doc.replace('\n', ' ').replace('  ', ' ').replace(
             '\r', ' ').replace('  ', ' ')
         parser = PlaintextParser.from_string(doc, Tokenizer('english'))
         summary = ''
         summary_sents = self.summarizer(parser.document, max_sents)
         for sentence in summary_sents:
             # print(sentence)
             summary += sentence._text + ' '
         summaries.append(summary.strip())
     return summaries
Example #13
 def summary_from_text(self, text, count):
     list = []
     article = ''
     try:
         parser = PlaintextParser.from_string(text,
                                              Tokenizer(self.language))
         self.save_article(parser)
         article = text
         for sentence in self.summarizer(parser.document, count):
             list.append(str(sentence))
     except (LinAlgError):
         list.append(text)
     except:
         list = []
     return article, list
Example #14
def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final = []

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        final.append(str(sentence))
    return jsonify(summary=final)
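The route above references module-level names (LANGUAGE, SENTENCES_COUNT, Summarizer, get_stop_words, request, jsonify) that the snippet does not define. A hedged sketch of that setup, assuming the common pattern of aliasing LsaSummarizer as Summarizer; the constant values are guesses.

# Assumed module-level setup for the Flask route; the alias and constants are guesses.
from flask import Flask, request, jsonify
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

app = Flask(__name__)
LANGUAGE = "english"
SENTENCES_COUNT = 5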
Example #15
    def summarize_doc(self, bullets=3):
        parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        master = ""
        sentence_counter = 0
        for sentence in summarizer(parser.document, bullets):
            print(sentence)
            master = str(master) + str(sentence)
            sentence_counter += 1

        print(sentence_counter)
        return master
Example #16
def slititle(sena):

    # Creates a temporary text file since sumy works that way
    with open('Titlefile___.txt', 'w') as tefil:
        tefil.write(sena)

    thefile = "Titlefile___.txt"  #name of the plain-text file
    parser = PlaintextParser.from_file(thefile, Tokenizer("english"))
    summarizer = LsaSummarizer()

    summary = summarizer(parser.document,
                         1)  # Reduce the document to 1 sentence
    os.remove('Titlefile___.txt')

    return str(summary[0]).split('.')[0]
Example #17
    def summarize(self, fields):
        """
        yields the summary on a hit to facilitate building bulk update
        """
        assert self.content_field in fields
        content = fields[self.content_field][0]
        language = fields[self.lang_field][0] if self.lang_field in fields else 'en'
        language = LANGUAGE_MAP[language]
        parser = PlaintextParser.from_string(content, Tokenizer(language))
        stemmer = Stemmer(language)
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)

        sentences = [str(s) for s in summarizer(parser.document, self.count)]
        summary = ' '.join(sentences)
        return summary
Example #18
def extract_summary_keywords(trend, urls, titles):
    total_articles_content = extract_text(urls)
    keywords = extract_keywords_from_all_text(total_articles_content, titles)
    current_path = os.path.dirname(os.path.realpath(__file__))
    current_path = current_path + '\\' + trend + '.txt'
    with open(current_path, 'w') as the_file:
        the_file.write(total_articles_content)
    parser = PlaintextParser.from_file(current_path, Tokenizer(LANGUAGE))
    os.remove(current_path)
    sentences = ''
    for sentence in summarizer(parser.document, 12):
        sentences = sentences + ' ' + str(sentence)
    replaced_syn = replacesynonym(sentences)
    matches = tool.check(sentences)
    correct_summary = language_check.correct(sentences, matches)
    return correct_summary, keywords
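The function above also relies on module-level objects it never defines: LANGUAGE, summarizer, tool, the language_check module, and helpers such as extract_text, extract_keywords_from_all_text and replacesynonym. Below is a hedged sketch of the summarization and grammar-checking setup it appears to assume; the choice of LexRankSummarizer is a guess and the helper functions are not reproduced.

# Assumed module-level setup; the summarizer choice is a guess.
import os
import language_check
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lex_rank import LexRankSummarizer

LANGUAGE = "english"
summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
tool = language_check.LanguageTool('en-US')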
Example #19
def summy_lex_rank_process_article_file(file_path):
    sents = []
    with io.open(file_path, 'r', encoding='utf-8') as article_file:
        for line in article_file:
            if line.find('@highlight') != -1:
                break
            line = line.strip()
            sents.extend(sent_tokenize(line))
    parser = PlaintextParser.from_string(' '.join(sents), Tokenizer('english'))
    summarizer = LexRankSummarizer()
    # Summarize the document with 2 sentences
    sums = summarizer(parser.document, NUM_SUM_SENTS)
    res_list = []
    for summary in sums:
        res_list.append(str(summary))
    return res_list
Example #20
def get_summary(article):
    url = article
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    parsed_articled = ""

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        parsed_articled += (str(sentence) + "\n")

    return parsed_articled
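This example likewise omits its imports and constants. A sketch of what they might look like for the URL-based parser; the SENTENCES_COUNT value and the Summarizer alias are assumptions.

# Assumed imports and constants for get_summary(); values are illustrative only.
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10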
Example #21
def sumy_sum(filename):
    text = pdf_text(filename)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    final = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        text = str(sentence)
        text = text.replace('\n', ' ')
        text = text.replace('\r', '')
        text = re.sub(r'[^\x00-\x7f]', r' ', text)
        text = re.sub(r'\s+', r" ", text)
        final += str(text)
    print(final)
Example #22
def get_content(url_name, extras):
    LANGUAGE = "english"
    SENTENCES_COUNT = 5
    url = str(url_name)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = extras
    try:
        result = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result.append(str(sentence))
        result = ' '.join(map(str, result))
        return {'result': result, 'source': url}
    except Exception as e:
        print('fail: ', e)
    return extras, sentence
Example #23
def make_summary(language, sentence_count, algorithm, text=None, url=None):
    tokenizer = Tokenizer(language)
    if url:
        parser = HtmlParser.from_url(url, tokenizer)
    else:
        parser = PlaintextParser.from_string(text, tokenizer)
    stemmer = Stemmer(language)

    summarizer = build_summarizer(algorithm, stemmer)
    summarizer.stop_words = stopwords.words(language)

    lines = []
    for sentence in summarizer(parser.document, sentence_count):
        lines.append(str(sentence))

    return '\n'.join(lines)
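build_summarizer() is not part of the snippet above, and the stop words come from NLTK (from nltk.corpus import stopwords). Below is a hypothetical sketch of what such a factory could look like, mapping algorithm names onto sumy summarizer classes; the accepted names are assumptions.

# Hypothetical build_summarizer(); the algorithm names it accepts are assumptions.
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

def build_summarizer(algorithm, stemmer):
    summarizers = {
        "lsa": LsaSummarizer,
        "lexrank": LexRankSummarizer,
        "textrank": TextRankSummarizer,
    }
    return summarizers[algorithm.lower()](stemmer)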
Example #24
def gen_sum(document, n, alg="LSA"):
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if alg == "LSA":
        return run_LSA(stemmer, parser.document)
    elif alg == "LexRank":
        return run_LexRank(stemmer, parser.document)
    elif alg == "TextRank":
        return run_TextRank(stemmer, parser.document)
    elif alg == "Luhn":
        return run_Luhn(stemmer, parser.document)
    elif alg == "SumBasic":
        return run_SumBasic(stemmer, parser.document, n)
    else:
        exit("Unkown extractive summarization algorithm!")
Example #25
    def __init__(self, transcript_file_path, summary_number):
        """ Input a transcript_file_path in the form of a string and a
			summary_number denoting the number of sentences requested in the summary.
		"""
        self.transcript_file = transcript_file_path
        full_transcript_text = file.read(open(self.transcript_file, "r"))
        self.tokenized_transcript = sent_tokenize(full_transcript_text)

        LANGUAGE = "English"
        parser = PlaintextParser.from_file(self.transcript_file,
                                           Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        self.summary = summarizer(parser.document, summary_number)
Example #26
def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final = []

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        final.append(str(sentence))
    length = len(final)
    return render_template('paraphrase.html', report=final, length=length)
Example #27
def summarized():
    # Requests data in forms
    URL = request.form['url-field']
    SENTENCES_COUNT = request.form['quantity']
    LANGUAGE = "english"

    # Summarization
    parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    list = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        list.append(sentence)

    return list
Example #28
def main(args=None):
    summarizer, document, items_count, reference_summary = handle_arguments()

    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(reference_summary,
                                                     Tokenizer(language))
    reference_sentences = reference_document.document.sentences

    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))

    return 0
Example #29
def summarize_sentences(sentences: str, language="english") -> list:
    # Preparation sentences
    corpus_maker = EnglishCorpus()
    preprocessed_sentences = corpus_maker.preprocessing(sentences)
    preprocessed_sentence_list = corpus_maker.make_sentence_list(
        preprocessed_sentences)
    corpus = corpus_maker.make_corpus()
    parser = PlaintextParser.from_string(" ".join(corpus), Tokenizer(language))

    # Call the summarization algorithm and do the summarization
    summarizer = LexRankSummarizer()
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(document=parser.document,
                         sentences_count=len(corpus) * 2 // 10)

    return summary
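summarize_sentences() depends on an EnglishCorpus helper that is not defined anywhere in the snippet. The stand-in below is purely hypothetical and only illustrates the interface the function expects (preprocessing, make_sentence_list, make_corpus).

# Hypothetical minimal stand-in for the undefined EnglishCorpus helper.
import re

class EnglishCorpus:
    def __init__(self):
        self._sentences = []

    def preprocessing(self, text):
        # Collapse runs of whitespace into single spaces.
        return re.sub(r"\s+", " ", text).strip()

    def make_sentence_list(self, text):
        # Naive split on sentence-terminating punctuation.
        self._sentences = [s for s in re.split(r"(?<=[.!?])\s+", text) if s]
        return self._sentences

    def make_corpus(self):
        return self._sentences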
Example #30
def test_document_is_all_in_upper_case():
    """
    When all words is in upper case Plaintext parser first line as heading and
    LexRank algorithm raises exception "ZeroDivisionError: float division by zero"
    because there is no sentence to summarize.
    See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 0
Example #31
    def button_clicked(self):
        self.label.setText("Recording you!")
        self.update()

        LANGUAGE = "english"
        SENTENCES_COUNT = 5  #collects the x number of relevant sentences.

        import speech_recognition as sr
        for index, name in enumerate(sr.Microphone.list_microphone_names()):
            print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name))

        # obtain audio from the microphone
        r = sr.Recognizer()
        commentary = ""

        while True:
            with sr.Microphone(1) as source:
                print("Say something!")
                audio = r.listen(source)
            # recognize speech using Google Speech Recognition
            try:
                speech = r.recognize_google(audio)
                print("You said " + speech) # use default Google API key `r.recognize_google(audio, key = implied)`
                commentary = commentary + speech # + ". "
                if "Corpus" in speech:
                    commentary = commentary - "corpus"
                    break
            except sr.UnknownValueError:
                print("Speech Recognition didn't catch that")
            except sr.RequestError as e:
                print("Could not request results from Speech Recognition service; {0}".format(e))
            except KeyboardInterrupt:
                break

        print("PRE-PUNCTUATION:", commentary, "\n")
        commentary = (fastpunct.punct([commentary], batch_size=32))
        print("PUNCTUATED", commentary, "\n")

        parser = PlaintextParser.from_string(commentary, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        print("SUMMARIZED TEXT:")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
Example #32
    def get_summary(self):
        # Each line holds one sentence, so split on newlines
        sentences = [t for t in self._text.split('\n')]
        for i in range(1):
            print(sentences[i])

        # Build the morphological analyzer
        analyzer = Analyzer(
            [
                UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter(r'[(\)「」、。]', ' ')
            ],  # replace (, ), 「, 」, 、 and 。 with spaces
            JanomeTokenizer(),
            [
                POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
                ExtractAttributeFilter('base_form')
            ]  # keep only base forms of nouns, adjectives, adverbs and verbs
        )

        # Join the extracted words with spaces.
        # The trailing '。' lets tinysegmenter (used later) split the text into sentences.
        corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
        """
		for i in range(2):
		    print(corpus[i])
		"""
        # 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。
        # 今年 11 月 SIer Web サービス 会社 転職 する。
        """
		from sumy.parsers.plaintext import PlaintextParser
		from sumy.nlp.tokenizers import Tokenizer
		from sumy.summarizers.lex_rank import LexRankSummarizer
		"""

        # Tokenize the joined corpus again with tinysegmenter
        parser = PlaintextParser.from_string(''.join(corpus),
                                             Tokenizer('japanese'))

        # Extract a 2-sentence summary with LexRank
        summarizer = LexRankSummarizer()
        summarizer.stop_words = [' ']  # a space is counted as a word, so exclude it as a stop word

        self.summary = summarizer(document=parser.document, sentences_count=2)

        # Print the original sentences
        for sentence in self.summary:
            print(sentences[corpus.index(sentence.__str__())])
Example #33
def summarize():
    rows = store.get_row_by_status(1)

    for row in rows:
        parser = PlaintextParser.from_string(row["content_origin"], Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = list()

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))

        summary = "\n".join(sentences)

        store.update_row(row["id"], {"summary_origin": summary, "status": 2})
Example #34
def perform_sumy_summarization(data, gold_standard_data):
    stemmer = Stemmer(LANGUAGE)

    summarizers = [LsaSummarizer(stemmer), TextRankSummarizer(stemmer), LexRankSummarizer(stemmer)]
    summarizer_names = ["Lsa", "TextRank", "LexRank"]

    # print "SUMY Scores: "
    # Read each sentence from 'data' and create a summary of it
    for line, gs_line in zip(data, gold_standard_data):
        # Only consider the content part of the text. Changed it from unicode to normal string
        # summarized_text = line["content"].encode('ascii', 'ignore')
        summarized_text = line["content"]
        if gs_line == "\n":
            continue
        gold_standard = gs_line

        # Read line by line instead of reading the entire file
        parser = PlaintextParser.from_string(summarized_text, Tokenizer(LANGUAGE))
        idx = 0
        for summarizer in summarizers:
            # Store the scores in a dictionary
            output_scores[line["index"]] = []
            sumy_dict_key = str(line["index"]) + summarizer_names[idx]
            sumy_scores_dict[sumy_dict_key] = []
            summarizer.stop_words = get_stop_words(LANGUAGE)
            # print "SUMY with", summarizer
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                if line["index"] in output_scores:
                    output_scores[line["index"]] = []
                if sumy_dict_key in sumy_scores_dict[sumy_dict_key]:
                    continue
                # Store output in a dictionary in the form of a key-value pair
                # Example -->  1: 'with the exception of the elderly and the youth'
                output_scores[int(line["index"])].append({"sumy_rouge_unigrams":
                                        calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 1)})
                output_scores[int(line["index"])].append({"sumy_rouge_bigrams":
                                        calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 2)})
                output_scores[int(line["index"])].append({"sumy_rouge_l":
                                        calculate_rouge_l_score(line["index"], gold_standard, str(sentence))})
                output_scores[int(line["index"])].append({"sumy_rouge_s":
                                        calculate_rouge_s_score(line["index"], gold_standard,str(sentence), 2)})
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 1))
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 2))
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_l_score(line["index"], gold_standard, str(sentence)))
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_s_score(line["index"], gold_standard,str(sentence), 2))
            idx += 1
Example #35
def summarize_sumy(doc, case):
    summary = ""
    file_doc = open("temp.txt", "w", encoding='utf-8')
    file_doc.write(doc)
    file_doc.close()

    parser = PlaintextParser.from_file("temp.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if case == 1:
        summarizer = LexRankSummarizer(stemmer)
    else:
        summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary += str(sentence) + ' '

    return summary
Example #36
def summarize(url):
    #url = "https://www.programmableweb.com/api/google-custom-search"

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(sentence)
        #print(sentence)

    return summary
Example #37
def sumy_summarization(input_text):
    sumy_output = {}
    start_time = time.time()

    parser = PlaintextParser.from_string(input_text, Tokenizer("english"))

    # using the LexRank summarizer, which is similar to the spaCy approach used above
    lex_summarizer = LexRankSummarizer()

    # Use the top 5 sentences as the summary
    summarized_sentences = lex_summarizer(parser.document, 5)
    summary_sentences = [str(sentence) for sentence in summarized_sentences]
    sumy_output['summary'] = ' '.join(summary_sentences)

    sumy_output['total_time'] = time.time() - start_time

    return sumy_output
Example #38
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"), Tokenizer("czech"))
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo.")
Example #39
def summarize():
    message = sys.stdin.readlines()
    print(message[0])
    jsonMsg = json.loads(message[0])

    TEXT = jsonMsg[0]
    LANGUAGE = jsonMsg[1]
    SENTENCES_COUNT = 1

    parser = PlaintextParser.from_string(TEXT, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example #40
def url_summarize():
    if not Validation.validate_api_request(
            request=request, required_properties=["url", "ratio"]):
        return API.api_response(
            API.failure_code,
            "Request does not match defined schema. Check documentation")

    json_content = request.get_json()

    parser = HtmlParser.from_url(url=json_content['url'],
                                 tokenizer=Tokenizer('english'))
    text_from_url = ' '.join(
        str(sentence) for sentence in parser.document.sentences)
    summary = gensim.summarization.summarize(text=text_from_url,
                                             ratio=json_content['ratio'])

    return API.api_response(API.success_code, summary)
Example #41
def test_tokenize_sentences_with_abbreviations():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("There are people who are weird, e.g. normal people. These people know you.")

    expected = ("There are people who are weird, e.g. normal people.", "These people know you.",)
    assert expected == sentences