Example #1
def test_single_sentence():
    document = build_document(("Já jsem jedna věta",))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem",)

    returned = summarizer(document, 10)
    assert len(returned) == 1
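Note: build_document and to_unicode in these test snippets are helpers from sumy's own test suite, not part of the public API. A minimal sketch of what they appear to assume, built on sumy's document model (an approximation, not the exact helpers):

from sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

def build_document(sentences, language="czech"):
    # Wrap plain strings into sumy's document model as a single paragraph.
    tokenizer = Tokenizer(language)
    return ObjectDocumentModel([Paragraph([Sentence(s, tokenizer) for s in sentences])])

def to_unicode(sentence):
    # The assertions compare summarizer output against plain strings.
    return str(sentence)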
Example #2
def models_LUHN_LEX_LSA_2(dataframe):
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    size = len(dataframe)
    stemmer = Stemmer(LANGUAGE)

    for i in range(size):
        article = dataframe.loc[i, "post_content"]

        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        summarizerLUHN = LUHN(stemmer)
        summarizerLUHN.stop_words = stop

        summarizerLEX = LEX(stemmer)
        summarizerLEX.stop_words = stop

        summarizerLSA = LSA(stemmer)
        summarizerLSA.stop_words = stop

        LUHNsentence = summarizerLUHN(parser.document, 1) #summarize document with one sentence
        LEXsentence = summarizerLEX(parser.document, 1) #summarize document with one sentence
        LSAsentence = summarizerLSA(parser.document, 1) #summarize document with one sentence

        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
Example #3
    def test_single_sentence(self):
        document = build_document(("Já jsem jedna věta",))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 1)
Example #4
def summarize(final_transcript, askuser=False):
    print('Summarizing transcript...')
    parser = PlaintextParser.from_file(final_transcript, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if askuser:
        summtype = input('Summarizer type? [1: Luhn, 2: Lex-Rank, 3: Text-Rank] ')
    else:
        summtype = SUMMMETHOD
    
    if summtype == '1':
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    elif summtype == '2':
        summarizer = LexRankSummarizer(stemmer)
        typename = 'lex'
    elif summtype == '3':
        summarizer = TextRankSummarizer(stemmer)
        typename = 'tex'
    else:
        # Fall back to Luhn so `summarizer` is always defined.
        summarizer = LuhnSummarizer(stemmer)
        typename = 'luhn'
    
    summarizer.stop_words = get_stop_words(LANGUAGE)
    count = SENTENCES_COUNT
    summaryfile = str(final_transcript)[:-4] + '_summ_' + typename + '.txt'
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentence_out = str(SENTENCES_COUNT - count + 1) + ':\n' + str(sentence) + '\n--------------\n'
        with open(summaryfile, 'a') as f:
            f.write(sentence_out)
        print(sentence_out)
        count -= 1
    return summaryfile
Example #5
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "6 e e e e e",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "5 z z z z",
        "6 e e e e e",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "3 c c c",
        "5 z z z z",
        "6 e e e e e",
    ]
Example #6
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]
Example #7
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]
Example #8
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example #9
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        # parser = None
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    finally:
        return sentences
Example #10
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example #11
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #12
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s", )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example #13
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #14
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "6 e e e e e",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "5 z z z z",
        "6 e e e e e",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "3 c c c",
        "5 z z z z",
        "6 e e e e e",
    ]
Example #15
def luhn(parser, sentence_count):
    summarizer_1 = LuhnSummarizer(Stemmer(language))
    summarizer_1.stop_words = get_stop_words(language)
    summary_1 = summarizer_1(parser.document, sentence_count)
    temp = ''
    for sentence in summary_1:
        temp += str(sentence)
    return temp
Example #16
def luhn_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(language)
    sentences = []
    for sentence in summarizer_luhn(parser.document, sentences_count):
        sentences.append(str(sentence))
    return "\n".join(sentences)
Example #17
    def test_two_sentences(self):
        document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem", "a", "ta",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #18
def __init__(text):
    LANGUAGE = "english"
    SENTENCES_COUNT = 1

    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)


    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, None, Tokenizer(LANGUAGE))

    allvariations = []

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
Example #19
def luhn_summarizer(text, stemmer, LANGUAGE, SENTENCES_COUNT):
    parser = PlaintextParser.from_string(text, sumytoken(LANGUAGE))
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in summarizer_luhn(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))
    return " ".join(sentences)
Example #20
    def luhn(self, text_parser):
        assert isinstance(text_parser, plaintext.PlaintextParser)

        summarizer = Luhn()
        summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
        return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
Example #21
def test_two_sentences():
    document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 10)
    assert list(map(to_unicode, returned)) == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
Example #22
def test_two_sentences():
    document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 10)
    assert list(map(to_unicode, returned)) == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
Example #23
def sumy_luhn_summarizer(docx):
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    luhn_summarizer = LuhnSummarizer(Stemmer("english"))
    luhn_summarizer.stop_words = get_stop_words("english")
    # Summarize the document with 2 sentences.
    summary = luhn_summarizer(parser.document, 2)
    summary_list = [str(sentence) for sentence in summary]
    result = ' '.join(summary_list)
    return result
Example #24
def _get_summary(document):
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = STOP_WORDS

    summary = " "
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary += " ".join(sentence.words)

    return summary
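Summarizer, LANGUAGE, STOP_WORDS and SENTENCES_COUNT are module-level names not shown in Example #24; one plausible setup (an assumption, with Summarizer aliased to Luhn for illustration):

from sumy.summarizers.luhn import LuhnSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
STOP_WORDS = get_stop_words(LANGUAGE)
SENTENCES_COUNT = 3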
Example #25
    def summarize_text(text):
        language = "english"

        parser = PlaintextParser.from_string(text, Tokenizer(language))

        summarizer = LuhnSummarizer(Stemmer(language))
        summarizer.stop_words = sumy.utils.get_stop_words(language)
        summary_text = ""
        for sentence in summarizer(parser.document, 5):
            summary_text += str(sentence) + " "

        return summary_text
Example #26
def searchGoogle(querystring):
    # TODO: handle exceptions; on failure, retry with the next Google result,
    # and likewise skip PDF and PPT links.
    num_page = 1
    linkno = 0
    while True:
        # Loop until an answer for querystring is found.
        try:
            print(
                '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
            )
            print("QUERY --> " + str(querystring))
            # print(querystring)

            searchresult = google.search(querystring, num_page)

            # Take the result at position linkno (starting from the first link).
            searchlink = searchresult[linkno].link
            print("Search Link --> " + str(searchlink))

            if searchlink[-4:] == '.pdf' or searchlink[-4:] == '.ppt':
                # Go to the next link if the current link is a PPT or PDF.
                print("Can't include ppts or pdfs, trying next link on Google")
                linkno += 1
                if linkno > 9:
                    # All links on this page exhausted; go to the next results page.
                    num_page += 1
                    linkno = 0
            else:
                LANGUAGE = "english"
                SENTENCES_COUNT = 10

                parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))

                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Summarisation using Luhn Summarizer
                stopwords1 = set(stopwords.words('english'))

                datastring = ''

                # using the LuhnSummarizer
                summarizer = LuhnSummarizer()
                summarizer.stop_words = stopwords1
                for sentence in summarizer(parser.document, SENTENCES_COUNT):
                    # print(sentence)
                    datastring += str(sentence) + ' '

                return datastring
        except Exception:
            linkno += 1
            if linkno > 9:
                # All links on this page exhausted; go to the next results page.
                num_page += 1
                linkno = 0
Example #27
def luhn_summarizer(data):
    text = data
    parser = PlaintextParser.from_string(text, sumytoken(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    print("\n", "*" * 30, "LUHN SUMMARIZER", "*" * 30)
    summarizer_luhn = LuhnSummarizer(stemmer)
    summarizer_luhn.stop_words = get_stop_words(LANGUAGE)
    result = ''
    for sentence in summarizer_luhn(parser.document, SENTENCES_COUNT):
        result += str(sentence) + ' '
    
    return result
Example #28
def luhnReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        #print(sentence._text)
        sentencesList.append(sentence._text)

    return sentencesList
Example #29
def luhnReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence._text)
        sentencesList.append(sentence._text)

    return sentencesList
Example #30
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    # Target roughly half the number of sentences in the transcript.
    actual_sentences_count = int(len(sent_tokenize(Audio_Text)) * 0.5)
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")
    if Summarize_Method == "Gensim":
        #ratio: define length of the summary as a proportion of the text
        temp = summarize(Audio_Text, ratio=0.5)
        sen = sent_tokenize(temp)
        sen = Counter(sen)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "LexRankSummarizer"):
        # Using LexRank(Sentence based ranking based on repeating sentences)
        summarizer_Lex = LexRankSummarizer(stemmer)
        summarizer_Lex.stop_words = get_stop_words("english")
        #Summarize the document with 2 sentences
        summary = summarizer_Lex(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "LuhnSummarizer"):
        # Using LUHN(Sentence based on frequency of most important words)
        summarizer_luhn = LuhnSummarizer(stemmer)
        summarizer_luhn.stop_words = get_stop_words("english")
        summary_1 = summarizer_luhn(parser.document, actual_sentences_count)
        sen = Counter(summary_1)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "LsaSummarizer"):
        # Using LSA(Sentence based on frequency of most important words)
        summarizer_lsa2 = LsaSummarizer()
        summarizer_lsa2 = LsaSummarizer(stemmer)
        summarizer_lsa2.stop_words = get_stop_words("english")
        summary = summarizer_lsa2(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
    elif (Summarize_Method == "TextRankSummarizer"):
        # Using LSA(Sentence based on frequency of most important words)
        summarizer_text = TextRankSummarizer()
        summarizer_text = TextRankSummarizer(stemmer)
        summarizer_text.stop_words = get_stop_words("english")
        summary = summarizer_text(parser.document, actual_sentences_count)
        sen = Counter(summary)
        temp = sen.most_common(sentences_count)
        for value in temp:
            print(value[0])
Example #31
def Luhn(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the chosen language

    summarizer = LuhnSummarizer(stemmer)  # Luhn algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
Example #32
    def test_two_sentences_but_one_winner(self):
        document = build_document(
            ("Já jsem 1. vítězná ta věta", "A já ta 2. vítězná věta"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = (
            "já",
            "jsem",
            "a",
            "ta",
        )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "A já ta 2. vítězná věta")
Example #33
def webBrowse():
    SENTENCES_COUNT = numOfSent.get()
    parser = HtmlParser.from_url(url.get(), Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizerurl(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    outputPath = "C://Users//rakesh chandra//Desktop//ATS//outputU.txt"
    with open(outputPath, 'w') as outputFile:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
            outputFile.write("->  ")
            outputFile.write(str(sentence))
            outputFile.write("\n \n")
    os.startfile(outputPath)
Example #34
def get_content(url_name, extras):
    LANGUAGE = "english"
    SENTENCES_COUNT = 5
    url = str(url_name)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = extras
    try:
        result = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result.append(str(sentence))
        result = ' '.join(result)
        return {'result': result, 'source': url}
    except Exception as e:
        print('fail: ', e)
        # Fall back to an empty result so callers always get the same shape.
        return {'result': None, 'source': url}
Example #35
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech"))
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
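Here stem_word is passed where a stemmer is expected; sumy accepts any callable that maps a word to its stem. The import presumably comes from sumy's Czech stemmer module (an assumption; the original imports are not shown):

from sumy.nlp.stemmers.czech import stem_word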
Example #36
def models_LUHN_LEX_LSA(article):
    ##    Candidate models:
    #        Bag of Words
    #        FastText
    #        word2vec
    #        LDA (topic extraction)
    #        skip-thoughts
    #        doc2vec
    #        LSTM

    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

    result = []

    summarizerLUHN = LUHN(stemmer)
    summarizerLUHN.stop_words = stop

    summarizerLEX = LEX(stemmer)
    summarizerLEX.stop_words = stop

    summarizerLSA = LSA(stemmer)
    summarizerLSA.stop_words = stop

    LUHNsentence = summarizerLUHN(parser.document, 1)  # summarize document with one sentence
    LEXsentence = summarizerLEX(parser.document, 1)  # summarize document with one sentence
    LSAsentence = summarizerLSA(parser.document, 1)  # summarize document with one sentence

    for sentence in LUHNsentence:
        LUHNsummary = sentence
    for sentence in LEXsentence:
        LEXsummary = sentence
    for sentence in LSAsentence:
        LSAsummary = sentence

    result.append(LUHNsummary)
    result.append(LEXsummary)
    result.append(LSAsummary)

    return result
Example #37
def test_real_example():
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech")
    )
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    assert list(map(to_unicode, returned)) == [
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.",
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě o rok mladších dětí budoval vedoucí pozici.",
    ]
Example #38
def simple_check():
    SUMMARY_SENTENCES_COUNT = 5
    LANGUAGE = "english"
    URL = "https://qz.com/1367800/ubernomics-is-ubers-semi-secret-internal-economics-department/"
    parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))
    document = parser.document
    stemmer = Stemmer(LANGUAGE)

    from sumy.summarizers.luhn import LuhnSummarizer

    LHS = LuhnSummarizer(stemmer)
    LHS.stop_words = get_stop_words(LANGUAGE)
    print("\nSummary using Luhn Summarizer")
    print("*******************************")
    for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
        print(sentence)

    html = urllib.request.urlopen(URL).read()
    soup = BeautifulSoup(html, features='html.parser')
    print(soup.prettify())
Example #39
def main(req: func.HttpRequest) -> func.HttpResponse:
    ret = ""
    logging.info('Python HTTP trigger function processed a request.')
    text = str(req.get_body())

    soup = BeautifulSoup(text, features="lxml")
    souped = soup.get_text()

    # Scale summary length with document size; guard against empty documents.
    SENTENCES_COUNT = max(1, int(math.log2(max(2, souped.count('.')))))
    
    parser = PlaintextParser.from_string(souped, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        ret += str(sentence)
    
    return func.HttpResponse(re.sub(r'\\\w{3}','',ret))
Example #40
def get_summary_per_section_luhn(cur_sents, each_summ_num):
    summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = (
        "I",
        "am",
        "the",
        "you",
        "are",
        "me",
        "is",
        "than",
        "that",
        "this",
    )
    parser = PlaintextParser(cur_sents, Tokenizer(LANGUAGE))
    summ = summarizer(parser.document, each_summ_num)
    decoded = []
    for line in summ:
        decoded.append(line._text)
    return decoded
Example #41
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s",)

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example #42
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import sys


LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]


if __name__ == "__main__":
    
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = LuhnSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
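This last example is a standalone command-line script: the first argument is the input text file and the second is the number of summary sentences. Assuming it is saved as luhn_summarize.py (a hypothetical name), it could be invoked as: python luhn_summarize.py article.txt 3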