Example 1
def get_summary(url, max_sent, language='english'):
    tokenizer = Tokenizer(language)
    parser, meta = get_parser(url, tokenizer)
    summary = run_summarizer(parser, max_sent, language)
    return dict(summary=summary, url=url, meta=meta, max_sent=max_sent)
Example 2
def lexranker(text, count):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	summarizer = LexRankSummarizer()
	summary = summarizer(parser.document, count)

	return summary
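
This snippet assumes the usual sumy imports shown in the other examples on this page, and sumy's English Tokenizer expects the NLTK sentence-tokenizer data to be installed. A minimal, hypothetical way to run it:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

sample = ("LexRank builds a graph over the sentences of a document. "
          "Each sentence is scored by its centrality in that graph. "
          "The highest-scoring sentences become the summary.")
for sentence in lexranker(sample, 2):
    print(sentence)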
Example 3
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


LANGUAGE = "english"
SENTENCES_COUNT = 10


if __name__ == "__main__":
    # or for plain text files
    parser = PlaintextParser.from_file("C:\\Users\\Administrator\\Desktop\\myfolder\\sea-and-adventures\\the-old-man-and-the-sea-3.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example 4
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print("LSA" + sentence._text)
        # print("LSA ends here")
        sentencesList.append(sentence._text)

    return sentencesList


if __name__ == "__main__":
    writer = open('output2.txt', 'w+')

    for root, dirs, files in os.walk(sys.argv[1]):
        for i in range(0, len(files)):
            filename = root + '/' + files[i]
            parser = PlaintextParser.from_file(filename, Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)

            suma4 = " "

            suma1 = lexrankReferenceSummary(filename)
            suma2 = textrankReferenceSummary(filename)
            suma3 = lsarankReferenceSummary(filename)
            suma4 = textrank.summary_main(filename)

            #keywords_ex = textsum.getKeywords(filename)

            #print(keywords_ex)

            writer.write("Filename= "+filename+"\n")
            writer.write('Reference Summary'+ "\n")
Example 5
def luhner(text, count):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	summarizer_luhn = LuhnSummarizer()
	summary_1 =summarizer_luhn(parser.document, count)

	return summary_1
Example 6
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10

if __name__ == "__main__":
    parser = PlaintextParser.from_file("dataaaa_text.txt'",
                                       Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example 7
def sumySummarize(filename, language="english", num_sents=1):
    """
    Luhn's algorithm is the most basic:
    1. Ignore Stopwords
    2. Determine Top Words: The most often occurring words in the document are counted up.
    3. Select Top Words: A small number of the top words are selected to be used for scoring.
    4. Select Top Sentences: Sentences are scored according to how many of the top words they 
    contain. The top N sentences are selected for the summary.
    
    SumBasic uses a simple concept:
    1. get word prob. p(wi) = ni/N (ni = no. of times word wi occurs, N is the total no. of words)
    2. get sentence score sj = sum_{wi in sj} p(wi)/|wi| (|wi| = no. of times wi appears in sj)
    3. choose sj with highest score
    4. update pnew(wi) = pold(wi)^2 for words in the chosen sentence (we want probability to include the same words to go down)
    5. repeat until you reach desired no. of sentences
    
    KL algorithm solves arg min_{S} KL(PD || PS) s.t. len(S) <= # sentences, where 
        KL = Kullback-Leibler divergence = sum_{w} PD(w) log(PD(w)/PS(w))
        PD = unigram word distribution of the entire document
        PS = unigram word distribution of the summary (optimization variable)
    
    LexRank and TextRank use a PageRank kind of algorithm
    1. Treat each sentence as a node in the graph
    2. Connect all sentences to get a complete graph (a clique basically)
    3. Find the similarity between si and sj to get the weight Mij of the edge connecting i and j
    4. Solve the eigenvalue problem Mp = p for the similarity matrix M.
    5. L = 0.15 + 0.85*Mp.  L gives the final score for each sentence.  Pick the top sentences
    LexRank uses a tf-idf modified cosine similarity for M.  TextRank uses some other similarity metric
    
    LSA uses a SVD based approach
    1. Get the term-sentence matrix A (rows are terms, columns are sentences).  Normalize with term frequency (tf) only
    2. Do SVD: A = USV' (A = m x n, U = m x n, S = n x n, V = n x n)
    SVD derives the latent semantic structure of the sentences.  The k-dimensional subspace captures the key k topics
    of the entire text structure: a mapping from n dimensions down to k.
    If a word combination pattern is salient and recurring in document, this
    pattern will be captured and represented by one of the singular vectors. The magnitude of the
    corresponding singular value indicates the importance degree of this pattern within the
    document. Any sentences containing this word combination pattern will be projected along
    this singular vector, and the sentence that best represents this pattern will have the largest
    index value with this vector. As each particular word combination pattern describes a certain
    topic/concept in the document, the facts described above naturally lead to the hypothesis that
    each singular vector represents a salient topic/concept of the document, and the magnitude of
    its corresponding singular value represents the degree of importance of the salient
    topic/concept.
    Based on this, summarization can be built on matrix V.  V describes the importance degree
    of each topic in each sentence: the k-th sentence we choose is the one with the largest
    index value in the k-th right singular vector of matrix V.  An extension of this is to use
    SV' as the score for each sentence.
    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    from sumy.summarizers.luhn import LuhnSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.sum_basic import SumBasicSummarizer
    from sumy.summarizers.kl import KLSummarizer

    parser = PlaintextParser.from_file(filename, Tokenizer(language))

    def getSummary(sumyAlgorithm):
        sumyAlgorithm.stop_words = get_stop_words(language)
        summary = sumyAlgorithm(parser.document, num_sents)
        sents = " ".join([str(sentence) for sentence in summary])
        return sents

    stemmer = Stemmer(language)

    summaries = {}
    summaries['Luhn'] = getSummary(LuhnSummarizer(stemmer))
    summaries['LSA'] = getSummary(LsaSummarizer(stemmer))
    summaries['TextRank'] = getSummary(TextRankSummarizer(stemmer))
    summaries['LexRank'] = getSummary(LexRankSummarizer(stemmer))
    summaries['SumBasic'] = getSummary(SumBasicSummarizer(stemmer))
    summaries['KL'] = getSummary(KLSummarizer(stemmer))

    print("")
    print("####### From Sumy #######")
    print(summaries)
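
The SumBasic update rule described in the docstring above (score sentences by word probability, then square the probabilities of the words just used) can be illustrated with a short standalone sketch. This is only an illustration of the idea, not sumy's SumBasicSummarizer, and the tokenization is deliberately crude:

import re
from collections import Counter


def sumbasic_sketch(text, num_sents=2):
    # Crude sentence and word splitting, good enough for the illustration.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    sent_words = [re.findall(r'\w+', s.lower()) for s in sentences]
    words = [w for ws in sent_words for w in ws]
    prob = {w: c / len(words) for w, c in Counter(words).items()}  # p(wi) = ni / N

    chosen = []
    while len(chosen) < min(num_sents, len(sentences)):
        # Score each remaining sentence by the average probability of its words.
        scores = [(sum(prob[w] for w in ws) / len(ws), i)
                  for i, ws in enumerate(sent_words) if i not in chosen and ws]
        if not scores:
            break
        best_score, best = max(scores)
        chosen.append(best)
        # Squaring the probabilities of the words just used makes sentences that
        # repeat the same content less attractive on later picks.
        for w in sent_words[best]:
            prob[w] = prob[w] ** 2
    return [sentences[i] for i in sorted(chosen)]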
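
Likewise, the damped eigenvector computation the docstring sketches for LexRank/TextRank (L = 0.15 + 0.85*M*p) amounts to a short power iteration. The plain bag-of-words cosine below is a stand-in for the tf-idf-modified cosine that LexRank actually uses:

import math
import re
from collections import Counter


def centrality_sketch(sentences, damping=0.85, iters=50):
    def cosine(a, b):
        # Bag-of-words cosine similarity (illustrative stand-in for tf-idf cosine).
        va = Counter(re.findall(r'\w+', a.lower()))
        vb = Counter(re.findall(r'\w+', b.lower()))
        dot = sum(va[w] * vb[w] for w in va)
        norm = (math.sqrt(sum(v * v for v in va.values()))
                * math.sqrt(sum(v * v for v in vb.values())))
        return dot / norm if norm else 0.0

    n = len(sentences)
    # Weighted complete graph over the sentences, rows normalized to sum to 1.
    sim = [[cosine(si, sj) for sj in sentences] for si in sentences]
    row_sums = [sum(row) or 1.0 for row in sim]
    m = [[sim[i][j] / row_sums[i] for j in range(n)] for i in range(n)]

    # Power iteration on p_i = (1 - d) + d * sum_j M_ji * p_j (damped PageRank).
    p = [1.0 / n] * n
    for _ in range(iters):
        p = [(1 - damping) + damping * sum(m[j][i] * p[j] for j in range(n))
             for i in range(n)]
    # A higher score means a more central sentence; the top ones form the summary.
    return sorted(range(n), key=lambda i: p[i], reverse=True)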
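
Finally, the Gong & Liu-style LSA selection the docstring describes (one sentence per leading right singular vector of the term-sentence matrix) looks roughly like this with numpy; again a sketch of the idea, not sumy's LsaSummarizer:

import re

import numpy as np


def lsa_sketch(sentences, num_sents=2):
    # Term-sentence matrix A: rows are terms, columns are sentences (raw term frequency).
    vocab = sorted({w for s in sentences for w in re.findall(r'\w+', s.lower())})
    index = {w: i for i, w in enumerate(vocab)}
    a = np.zeros((len(vocab), len(sentences)))
    for j, s in enumerate(sentences):
        for w in re.findall(r'\w+', s.lower()):
            a[index[w], j] += 1.0

    # A = U S V'; row k of V' gives each sentence's weight on latent topic k.
    _, _, vt = np.linalg.svd(a, full_matrices=False)
    picked = []
    for k in range(min(num_sents, vt.shape[0])):
        # Pick the not-yet-chosen sentence most aligned with topic k.
        order = np.argsort(-np.abs(vt[k]))
        picked.append(next(int(j) for j in order if int(j) not in picked))
    return [sentences[j] for j in sorted(picked)]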
Example 8
import pandas as pd
from rouge import Rouge
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer

#%%
### Extractive Summarizers ###

#%%
document1 = """Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics."""
# %%
# For Strings
parser = PlaintextParser.from_string(document1, Tokenizer("english"))

# %%
# Using LexRank
lex_summarizer = LexRankSummarizer()
summary = lex_summarizer(parser.document,
                         2)  # Summarize the document with 2 sentences
for sentence in summary:
    print(sentence)

# %%
# Using Luhn
luhn_summarizer = LuhnSummarizer()
summary_1 = luhn_summarizer(parser.document, 2)
for sentence in summary_1:
    print(sentence)
Example 9
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = 'english'
SENTENCES_COUNT = 4

parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n===== Luhn =====")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("\n===== TextRank =====")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("\n===== LSA =====")
Example 10
def func(file1, username, wc):
    Summary = ""
    packet = BytesIO()
    packet.seek(0)

    filename = "/" + file1
    #print(user)
    config = {
        "apiKey": "AIzaSyDvTZQo3KQIWvDmMwP16ItJ_DaJEylIGrc",
        "authDomain": "fir-android-c7a0d.firebaseapp.com",
        "databaseURL": "https://fir-android-c7a0d.firebaseio.com",
        "storageBucket": "fir-android-c7a0d.appspot.com"
    }
    firebase = pyrebase.initialize_app(config)
    stor = firebase.storage()
    #os.remove("T3.pdf")
    stor.child(filename).download("T3.pdf")
    pdf_document = "T3.pdf"
    doc = fitz.open(pdf_document)
    page_Count = doc.pageCount
    for v in range(0, int(page_Count)):
        page1 = doc.loadPage(v)
        pageText = page1.getText("text")
        # Get text from StringIO
        text = pageText
        text1 = ""
        text3 = ""
        # Normalize whitespace: turn tabs/newlines into spaces and collapse repeated
        # spaces (str.replace returns a new string, so the result must be assigned back).
        text = text.replace('\t', ' ').replace('\n', ' ')
        while "  " in text:
            text = text.replace("  ", " ")
        t = 0
        i = 0
        j = 0
        k = 0
        flag1 = 0
        for i in range(t, len(text)):
            if text[i] == '.':
                for j in range(i + 1, len(text)):
                    if text[j] == '.':
                        text1 = text[i:j]
                        for k in text1:
                            flag1 = 0
                            if k in {':', '!', '-', '(', ')'}:
                                flag1 = 1
                                break
                        if flag1 == 1:
                            break
                        break
                if flag1 == 1:
                    continue
                else:
                    text3 = text3 + text[i:j]
            t = j
        r = 0
        for i in range(0, r - 1):
            if text3[i] == '.' and text3[i + 1] != ' ':
                text3 = text3.replace(text3[i + 1], '')
            r = len(text3)
        # Default to 50 sentences when no count was requested.
        w_count = int(wc)
        if w_count == 0:
            w_count = 50
        W_Count = w_count
        counters = 0
        for p in text3:
            if p == " ":
                counters += 1
        if counters < 20:
            continue
        #out=summarize(text3,ratio=(W_Count*.01))
        LANGUAGE = "english"
        SENTENCES_COUNT = W_Count
        parser = PlaintextParser.from_string(text3, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        out = ""
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            out += str(sentence)
        if out == "":
            out = "Not enough words in this page to summarize"
        Summary = Summary + "\n\n Page No : " + str(v + 1) + "\n\n"
        Summary = Summary + " " + out
    lengther = 0
    for i in Summary:
        lengther += 1
        if i == '.':
            break
    out1 = Summary[lengther:len(Summary)]
    out = "Summary\n\n\n Page No: 1\n\n" + out1

    outfile = 'final.txt'
    with open(outfile, "w+") as filer:
        filer.write(out)
    bucket = storage.bucket()

    blob = bucket.blob(str(username) + '/' + 'final.txt')
    blob.upload_from_filename(outfile)
    #os.remove(outfile)

    return out
Example 11
def get_summary(textss, truereq, numofsent):
    output_sentences = []
    hold = ''
    truecount = 0
    store = ''
    store = keywords(
        textss, ratio=0.05)  #extracting the most relevant words from full text
    store1 = str(store)
    holdfirst = nltk.word_tokenize(
        store1)  #storing the tokenized string (keywords) to remove punctuation
    parser = PlaintextParser.from_string(
        textss, Tokenizer(LANGUAGE))  #storing the full text into an object
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentencess = []
    compare = []
    TEMP_FOLDER = tempfile.gettempdir()
    documents = sent_tokenize(textss)  #storing sentences of full text
    summalen = len(documents)  #storing the number of sentences
    stoplist = set('for a of the and to in'.split())

    for sentence in summarizer(parser.document, numofsent):
        hold = str(sentence)
        ttt = nltk.word_tokenize(hold)
        count = 0
        for i in range(0, len(ttt)):
            for j in range(0, len(holdfirst)):
                if ttt[i] == holdfirst[j]:
                    count += 1
        compare.append(count)
        sentencess.append(str(sentence))

    texts = [
        [word for word in document.lower().split() if word not in stoplist]
        for document in documents
    ]  #storing an array of sentences where each sentence is a list of words without stopwords
    frequency = defaultdict(
        int
    )  #storing a subclass that calls a factory function to supply missing values

    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts
             ]  #storing an array of words that occur more than once

    dictionary = corpora.Dictionary(texts)  #storing a map of words
    dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    new_doc = str(textss.encode(
        'utf-8'))  #storing the utf-8 version of textss (original)
    new_vec = dictionary.doc2bow(
        new_doc.lower().split()
    )  #converting the utf-8 econded textss into a bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded).

    corpus = [
        dictionary.doc2bow(text) for text in texts
    ]  #applying doc2bow to texts(list of  words that occur more than once) save into an array
    corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'),
                               corpus)
    dictionary = corpora.Dictionary.load(
        os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    doc = str(textss.encode('utf-8'))
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]  #converting the query to LSI space
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save(os.path.join(TEMP_FOLDER, 'deerwester.index'))
    index = similarities.MatrixSimilarity.load(
        os.path.join(TEMP_FOLDER, 'deerwester.index'))
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    newlist = []

    for i in range(0, summalen):
        newlist.append(documents[sims[i][0]])
        if i == 4:
            break

    for sentencez in newlist:
        hold = str(sentencez)
        ttt = nltk.word_tokenize(hold)
        count = 0

        for i in range(0, len(ttt)):
            for j in range(0, len(holdfirst)):
                if ttt[i] == holdfirst[j]:
                    count += 1
        compare.append(count)
        sentencess.append(str(sentencez))
    i = 0
    while i < truereq:
        holdsubs = []
        indexes = compare.index(max(compare))
        doc1 = nlp(u'%s' % str(sentencess[indexes]))
        parse = doc1
        for word in parse:
            if word.dep_ == 'nsubj':
                holdsubs.append(word.text.lower())
        if holdsubs:
            if holdsubs[0] != 'they' and holdsubs[0] != 'their' and holdsubs[
                    0] != 'both' and holdsubs[0] != 'these' and holdsubs[
                        0] != 'this':
                countcomma = str(sentencess[indexes]).count(',')
                if countcomma < 7:
                    output_sentences.append(sentencess[indexes])
                    i += 1
        del sentencess[indexes]
        del compare[indexes]
    return output_sentences
Example 12
def clicked():
    file = open('testfile.txt', 'a')
    #website to text file as testfile.txt
    html = requests.get(url1.get()).content
    #1 Recoding
    unicode_str = html.decode("utf8")
    encoded_str = unicode_str.encode("ascii", 'ignore')
    news_soup = BeautifulSoup(encoded_str, "html.parser")
    title = news_soup.find_all('h1')
    z = [re.sub(r'<.+?>', r'', str(b)) for b in title]
    s1 = ''.join(z) + '.' + '\n'
    file.write(s1)

    #finding the summary of text file and again store it into testfile.txt
    LANGUAGE = "english"
    SENTENCES_COUNT = 10

    if __name__ == "__main__":

        url = url1.get()

        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

        print("--LuhnSummarizer--")
        summarizer = LuhnSummarizer()
        summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = (
            "I",
            "am",
            "the",
            "you",
            "are",
            "me",
            "is",
            "than",
            "that",
            "this",
        )
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            str1 = str(sentence)
            file.write(str1)
        file.close()

    #open the text file and divide it into 8 parts as 0.txt to 7.txt
    str1 = open('testfile.txt', 'r').read()
    #print(str1)
    l = str1.split(".")
    i = len(l)
    for j in range(8):
        file = open('text/' + str(j) + '.txt', 'a')
        s0 = ''.join(l[j])
        file.write(s0)

    def _patch_faulty_function(self):
        if self.token_key is not None:
            return self.token_key
        timestamp = calendar.timegm(time.gmtime())
        hours = int(math.floor(timestamp / 3600))

        response = requests.get("https://translate.google.com/")
        line = response.text.split('\n')[-1]
        parsed = re.search(r"(?:TKK='(?:(\d+)\.(\d+))';)", line)
        a, b = parsed.groups()
        result = str(hours) + "." + str(int(a) + int(b))
        self.token_key = result
        return result

    # Monkey patch faulty function.
    Token._get_token_key = _patch_faulty_function

    # Then call it normally.
    #with open('testfile.txt', 'r') as myfile:
    #   data=myfile.readlines()

    for k in range(8):
        str1 = open('text/' + str(k) + '.txt', 'r').read()
        #print(str1)
        #str1 = "my name is khan"
        if (len(str1) != 0):
            tts = gTTS(str1)
            tts.save('voice/' + str(k) + '.mp3')

    keyword = open('text/0.txt', 'r').read()
    #print(keyword)
    st = 'googleimagesdownload --keywords "' + keyword + '" --limit 8'

    os.system(st)
    os.system("D:/VideoBeta/VideoBeta.exe")
Example 13
def summarize(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    # request_json = request.get_json()
    # if request.args and 'message' in request.args:
    #     return request.args.get('message')
    # elif request_json and 'message' in request_json:
    #     return request_json['message']
    # else:
    #     return f'Hello World!'
    try:
        if request.method == 'OPTIONS':
            # Allows GET requests from any origin with the Content-Type
            # header and caches preflight response for 3600s
            headers = {
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods':
                'GET, POST, PUT, PATCH, DELETE, OPTIONS',
                'Access-Control-Allow-Headers':
                'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization',
                'Access-Control-Expose-Headers':
                'Content-Length,Content-Range',
                'Access-Control-Max-Age': '3600'
            }
            return ('', 204, headers)

        headers = {
            'Access-Control-Allow-Origin': '*',
        }
        request_json = request.get_json()
        document = request_json['value']
    except:  #for local try using py main.py
        headers = None
        document = request['value']
    finally:

        parser = PlaintextParser.from_string(document, Tokenizer("english"))

        summaries = {}
        number_pool = [0, 1, 2, 3]
        random.shuffle(number_pool)
        print(number_pool)

        for i in range(len(number_pool)):
            if number_pool[i] == 0:
                summarizer = LexRankSummarizer()
            if number_pool[i] == 1:
                summarizer = LuhnSummarizer()
            if number_pool[i] == 2:
                summarizer = LsaSummarizer(Stemmer("english"))
                summarizer.stop_words = get_stop_words("english")
            if number_pool[i] == 3:
                summarizer = PureNLTKSummarizer()

            summary = summarizer(parser.document, 3)
            sum_string = []
            for sentence in summary:
                sum_string.append(str(sentence))
            summaries[f'{i}'] = " ".join(sum_string)
        if headers is None:
            return summaries
        return (summaries, 200, headers)
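
For a quick local check, the except branch above lets summarize be called with a plain dict instead of a flask.Request. The input text here is made up, and the module is assumed to already provide the summarizer imports and the PureNLTKSummarizer referenced in this example:

if __name__ == "__main__":
    result = summarize({'value': "First sentence of a test document. "
                                 "A second sentence with more detail. "
                                 "A third sentence that repeats the detail. "
                                 "A closing fourth sentence."})
    for key, text in result.items():
        print(key, "->", text)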
Example 14
# -*- coding: utf8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

from os.path import dirname, join, abspath
from sumy.nlp.tokenizers import Tokenizer
from sumy._compat import to_string, to_unicode
from sumy.models.dom import ObjectDocumentModel, Paragraph, Sentence

_TOKENIZER = Tokenizer("czech")


def expand_resource_path(path):
    return join(abspath(dirname(__file__)), to_string("data"), to_string(path))


def load_resource(path):
    path = expand_resource_path(path)
    with open(path, "rb") as file:
        return to_unicode(file.read())


def build_document(*sets_of_sentences):
    paragraphs = []
    for sentences in sets_of_sentences:
Example 15
def test_most_frequent_terms_empty():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)

    assert model.most_frequent_terms() == ()
    assert model.most_frequent_terms(10) == ()
Example 16
def txt_summary(doc, sentences_num):
    parser = PlaintextParser.from_string(doc, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentences_num)
    return summary
Example 17
def test_most_frequent_terms_negative_count():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("text", tokenizer)

    with pytest.raises(ValueError):
        model.most_frequent_terms(-1)
Example 18
analyzer = Analyzer(
    [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(
        r'[(\)「」、。]', ' ')],  # replace ()「」、。 with spaces
    JanomeTokenizer(),
    [POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']), ExtractAttributeFilter(
        'base_form')]  # keep only the base forms of nouns, adjectives, adverbs and verbs
)

# Join the extracted words with spaces.
# The trailing '。' is added so that tinysegmenter (used below) splits each line as a sentence.
corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
for i in range(2):
    print(corpus[i])
# 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。
# 今年 11 月 SIer Web サービス 会社 転職 する。


# Re-tokenize the joined corpus with tinysegmenter
parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

# Extract a 2-sentence summary with LexRank
summarizer = LexRankSummarizer()
summarizer.stop_words = [' ']  # a space is recognized as a word on its own, so exclude it by making it a stop word

summary = summarizer(document=parser.document, sentences_count=2)

print('要約しました')
# Print the original sentences
for sentence in summary:
    print(sentences[corpus.index(sentence.__str__())])
Example 19
    all_scores = defaultdict(dict)

    i = 0

    final_documents = {}
    for bill in data:
        i += 1
        if i % 50 == 0:
            print(i)

        summary = bill['clean_summary']
        doc = bill['clean_text']
        bill_id = bill['bill_id']

        doc2 = PlaintextParser(doc, Tokenizer(LANGUAGE)).document
        for name, Summarizer in ALL_SUMMARIZERS:
            try:
                summarizer = Summarizer(stemmer)
                #summarizer.stop_words = get_stop_words(LANGUAGE)

                # Score all sentences -- then keep up to 2000 char
                total_sentences = len(doc2.sentences)
                sent_scores = summarizer(doc2, total_sentences)
                sent_scores = [(str(s.sentence), s.rating)
                               for s in sent_scores]

                # Pick best set with greedy

                summary_len = 2000
                final_sents = greedy_summarize(*zip(*sent_scores),
Example 20
    def summarize_from_text(self, text):
        parser = PlaintextParser.from_string(text, Tokenizer(self.__language))
        return self.__summarize(parser)
Example 21
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "japanese"
SENTENCES_COUNT = 3

text = """乗客106人が死亡し、562人が重軽傷を負った兵庫県尼崎市のJR福知山線脱線事故は25日、発生から15年となったが、今年は新型コロナウイルス感染拡大の影響で、JR西日本主催の追悼慰霊式が初めて開かれなかった。現場を訪れることを断念した遺族や負傷者らは、それぞれの場所で祈りをささげた。

 次男の昌毅(まさき)さん=当時(18)=を亡くした上田弘志さん(65)は体調を崩したことから自宅で昌毅さんの冥福を祈り、「現場に行けなくてごめんな」と頭を下げたという。

 ただ、うれしい報告もできた。3日前、三男の篤史(あつし)さん(30)に長女の陽菜(ひな)ちゃんが誕生したことだ。「こんなに前向きになれた25日は初めて」と上田さん。昌毅さんに「来年は陽菜を抱っこして現場に行くね」と約束したといい、「『じいちゃん、頑張れ』って返してくれた」とほほえんだ。

 3両目に乗車し、顔や足に重傷を負った玉置富美子さん(70)も体調を考慮し、現場を訪れなかった。だが、「現場はたくさんの命が犠牲となり、自分にとっては人生が百八十度変わった場所。行くことで重みを感じられる」とした上で「式典がなかったら思い出す機会もなくなってしまう。JR西はこの日を忘れないように引き継いでほしい」と訴えた。


 2両目に乗っていた次女(34)が重傷を負った三井ハルコさん(64)は、これまで家族で追悼慰霊式に参加した後、現場近くで事故の風化防止を願う栞(しおり)を配布してきたが、今年は自宅に。「今日は15年間で初めて静かに祈りの時間を過ごすことができた」と話した。"""

if __name__ == "__main__":
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
Example 22
from modules.sql import dBAdapter
from modules.pre import create_corpus as c
from nltk import sent_tokenize

n_documents = 4

#----------------------------------------------------------------------------
print("Getting body subtitles from the database started ...")
dbAdapter = dBAdapter.Database()
dbAdapter.open()
dic_subtitles = dict(dbAdapter.selectDic_subtitles_limit(n_documents))
dbAdapter.close()
print("finalizada consulta")

string = sent_tokenize(list(dic_subtitles.values())[0])

from sumy.parsers.plaintext import PlaintextParser
#for tokenization
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string(
    list(dic_subtitles.values())[0], Tokenizer("spanish"))

from sumy.summarizers.lsa import LsaSummarizer
summarizer_2 = LsaSummarizer()
summary_2 = summarizer_2(parser.document, 10)
summ_list = []
for sentence in summary_2:
    summ_list.append(sentence._text)
summ_text = " ".join(summ_list)
Example 23
cur.execute(qry)
questions=cur.fetchall()
for q in questions:
    print(q.VerbatimQuestionID)
    qry="select sentence_norm from [CPE].[t_Sentence_Tokenized4_H2FY16SMSPManaged1] where verbatimQuestionID=%s"%(q.VerbatimQuestionID)
    cur.execute(qry)
    rows=cur.fetchall()
    txt=open("C:\\Users\\kach\\OneDrive\\iWorks\\Python\\Text Summarization\\SQL2TXT.txt","w")
    for row in rows:
        row=''.join(map(str,row))
        outputstring=''.join([row,"\n"])
        txt.write(outputstring)
    txt.close()

    file="C:\\Users\\kach\\OneDrive\\iWorks\\Python\\Text Summarization\\SQL2TXT.txt"
    parser = PlaintextParser.from_file(file, Tokenizer("english"))
    summarizer = LexRankSummarizer()

    summary = summarizer(parser.document, 5) #Summarize the document with 5 sentences

    sumx=""
    for sentence in summary:
        sumx=sumx+str(sentence)+"\n"
    cur.execute("""INSERT INTO [CPE].[t_Sentence_Tokenized4_H2FY16SMSPManaged1_TxtSummary] VALUES(?,?)""",(q.VerbatimQuestionID,sumx))

con.commit()
con.close()

print("Completed!")

Example 24
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer

from src.Enums.SummarizerEnums import SummarizerType
from src.Summarizers.SumySummarizer import SumySummarizer

stemmer = Stemmer('english')
tokenizer = Tokenizer('english')

lsa = SumySummarizer(summarizerType=SummarizerType.LSA)
ed = SumySummarizer(summarizerType=SummarizerType.Edmundson)
lex = SumySummarizer(summarizerType=SummarizerType.LexRank)
rand = SumySummarizer(summarizerType=SummarizerType.Random)

url = "https://www.cbc.ca/news/canada/toronto/skinny-dipping-sharks-ripleys-1.4862945"
url2 = "https://www.bbc.com/news/business-45986510"

results = {'lsa': lsa.get_summary(url2),
           'ed': ed.get_summary(url2),
           'lex': lex.get_summary(url2),
           'rand': rand.get_summary(url2)}

print(results)
Example 25
def lsaer(text, count):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	summarizer_lsa = LsaSummarizer()
	summary_2 =summarizer_lsa(parser.document, count)

	return summary_2
Example 26
    def gaz(doc_str, many):

        nlp = spacy.load('en')

        # doc_str = doc_str.replace("\\", "")

        #### Summary:

        ### Summaries
        import sumy

        from sumy.summarizers.lex_rank import LexRankSummarizer
        from sumy.summarizers.text_rank import TextRankSummarizer

        from sumy.parsers.plaintext import PlaintextParser
        from sumy.nlp.tokenizers import Tokenizer
        from sumy.nlp.stemmers import Stemmer
        from sumy.utils import get_stop_words

        lexi = LexRankSummarizer(Stemmer("english"))
        texi = TextRankSummarizer(Stemmer("english"))

        parser = PlaintextParser.from_string(doc_str, Tokenizer("english"))

        texi = TextRankSummarizer(Stemmer("english"))

        rentence = "dddd"
        for sentence in texi(parser.document, 20):  # This does indeed summarise the document
            if (str(rentence).split()[-1][-1] == ".") and (len(rentence) > 2):
                rentence = rentence + " " + str(sentence)
            elif len(rentence) < 3:
                rentence = rentence + " " + str(sentence)
            else:
                rentence = rentence + ". " + str(sentence)

        stop_words = set(stopwords.words('english'))
        stop_words.update(['.', ',', '"', "'", '?', '!', '! !', ':', ';', '(', ')', '[', ']', '{',
                           '}'])  # remove it if you need punctuation

        list_of_words = [i.lower() for i in wordpunct_tokenize(doc_str) if i.lower() not in stop_words]

        final = ' '.join(list_of_words)

        from nltk.tokenize import RegexpTokenizer

        tokenizer = RegexpTokenizer(r'\w+')
        list_of_words = tokenizer.tokenize(final)
        final = ' '.join(list_of_words)

        parsed_review = nlp(final)

        # print(parsed_review)

        token_text = [token.orth_ for token in parsed_review]
        token_pos = [token.pos_ for token in parsed_review]

        df = pd.DataFrame({'token_text': token_text, 'part_of_speech': token_pos})

        # Unigrams
        import nltk
        from nltk import word_tokenize
        from nltk.util import ngrams
        from collections import Counter

        token = nltk.word_tokenize(str(parsed_review))
        grams = ngrams(token, many)

        dra = Counter(grams)

        t = pd.DataFrame()

        f = pd.DataFrame(list(dra.keys()))

        if many == 2:
            f[0] = f[0] + " " + f[1]

        if many == 3:
            f[0] = f[0] + " " + f[1] + " " + f[2]

        f = f[0]

        t["name"] = f
        t["count"] = list(dra.values())

        df = df.drop_duplicates()
        r = pd.merge(t, df, left_on=["name"], right_on=["token_text"], how="left", right_index=False)
        r = r.drop("token_text", axis=1)
        r.columns = ["name", "count", "pos"]

        scaler = MinMaxScaler()
        r["norm"] = scaler.fit_transform(r["count"].values.reshape(-1, 1))

        if many == 1:
            dfs = r[r["pos"] == "NOUN"].sort_values("count", ascending=False)
        else:
            dfs = r.sort_values("count", ascending=False)

        return dfs, rentence
Example 27
def summarization(id):
    summarizer = LexRankSummarizer()
    """Summarization and Factors influnce for POSITIVE feedbacks"""
    pos_query = Feedback.query.filter_by(sentiment='POSITIVE').filter_by(
        session=id).all()
    neg_query = Feedback.query.filter_by(sentiment='NEGATIVE').filter_by(
        session=id).all()
    if len(pos_query) == 0 and len(neg_query) == 0:
        return "0"
    else:
        pos_text = ""
        for i in range(len(pos_query)):
            pos_text = pos_text + str(pos_query[i].description)

        cleaned_pos_text = pos_text.lower().translate(
            str.maketrans('', '', string.punctuation))
        tokenized_pos_words = word_tokenize(cleaned_pos_text, "english")
        final_pos_words = []
        for word in tokenized_pos_words:
            if word not in stopwords.words('english'):
                final_pos_words.append(word)
        """Counting Factors for POSITIVE"""
        w = Counter(final_pos_words)
        a = {}
        for x in List_of_factor:
            if x in w.keys():
                a[x] = w[x]
        pos_fact = sorted(a.items(), key=lambda x: x[1], reverse=True)
        """Summary of POSITIVE"""
        parser = PlaintextParser.from_string(pos_text, Tokenizer("english"))
        summ_Pos = ""
        abstract_pos = summarizer(parser.document, 1)
        for sentence in abstract_pos:
            summ_Pos = summ_Pos + str(sentence)
        """Summarization and Factors influnce for NEGATIVE feedbacks"""
        neg_text = ""
        for i1 in range(len(neg_query)):
            neg_text = neg_text + str(neg_query[i1].description)

        cleaned_neg_text = neg_text.lower().translate(
            str.maketrans('', '', string.punctuation))
        tokenized_neg_words = word_tokenize(cleaned_neg_text, "english")
        final_neg_words = []
        for word in tokenized_neg_words:
            if word not in stopwords.words('english'):
                final_neg_words.append(word)
        """Counting Factors for NEGATIVE"""
        w = Counter(final_neg_words)
        b = {}
        for x in List_of_factor:
            if x in w.keys():
                b[x] = w[x]
        neg_fact = sorted(b.items(), key=lambda x: x[1], reverse=True)
        """Summary of NEGATIVE"""
        parser = PlaintextParser.from_string(neg_text, Tokenizer("english"))
        summ_Neg = " "
        abstract_neg = summarizer(parser.document, 1)
        for sentence in abstract_neg:
            summ_Neg = summ_Neg + str(sentence)

        return {
            'cnt_pos': pos_fact[0:5],
            'cnt_neg': neg_fact[0:5],
            'summ_pos': summ_Pos,
            'summ_neg': summ_Neg
        }
Example 28
def test_magnitude():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wD"
    model = TfDocumentModel(text, tokenizer)

    assert model.magnitude == pytest.approx(2.0)
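
For reference, if magnitude is the Euclidean norm of the term-frequency vector (which is what this assertion suggests), four distinct words with frequency 1 each give sqrt(1 + 1 + 1 + 1) = 2.0.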
Example 29
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

analyser = SentimentIntensityAnalyzer()
LANGUAGE = "english"
SENTENCES_COUNT = 4

url = "https://en.wikipedia.org/wiki/Automatic_summarization"
parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

stemmer = Stemmer(LANGUAGE)

summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)
Example 30
def testSummarize():
    txt='''
        "As complexity rises , precise statements lose meaning and meaningful statements lose precision . ( Albert Einstein ) .", 
        "Fuzzy logic deals with reasoning that is approximate rather than fixed and exact . This may make the reasoning more meaningful for a human :", 
        "", 
        "", 
        "I 've written a short introduction to fuzzy logic that goes into a bit more details but should be very accessible .", 
        "Fuzzy logic seems to have multiple of applications historically in Automotive Engineering .", 
        "I found an interesting article on the subject from 1997 . This excerpt provides an interesting rationale :", 
        "Here are some papers and patents for automatic transmission control in motor vehicles . One of them is fairly recent :", 
        "Automatic Transmission Shift Schedule Control Using Fuzzy Logic SOURCE : Society of Automotive Engineers , 1993", 
        "Fuzzy Logic in Automatic Transmission Control SOURCE : International Journal of Vehicle Mechanics and Mobility , 2007", 
        "Fuzzy control system for automatic transmission | Patent | 1987", 
        "Transmission control with a fuzzy logic controller | Patent | 1992", 
        "", 
        "Likewise with fuzzy logic anti-lock breaking systems ( ABS ) :", 
        "Antilock-Braking System and Vehicle Speed Estimation using Fuzzy Logic SOURCE : FuzzyTECH , 1996", 
        "Fuzzy Logic Anti-Lock Break System SOURCE : International Journal of Scientific & Engineering Research , 2012", 
        "Fuzzy controller for anti-skid brake systems | Patent | 1993", 
        "", 
        "This method seems to have been extended to aviation :", 
        "A Fuzzy Logic Control Synthesis for an Airplane Antilock-Breaking System SOURCE : Proceedings of the Romanian Academy , 2004", 
        "Landing gear method and apparatus for braking and maneuvering | Patent | 2003", 
        ""
    '''
    texts = []
    for p in txt.split("\n"):
        texts.append("<p>" + p + "</p>")
    txt = " ".join(texts)
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    #from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    #from sumy.summarizers.kl import KLSummarizer as Summarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
    from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent
    from textblob import TextBlob
    LANGUAGE = "english"

    pros = PreprocessPostContent()
    #url = "https://github.com/miso-belica/sumy"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    texts = pros.getPlainTxt(txt)
    #print(TextBlob(txt).sentences)
    print(len(texts))
    for p in texts:
        print("#p=>", p)
    SENTENCES_COUNT = len(texts)

    document = []
    for t in texts:
        document.append(t)
        document.append("")
    document = "\n".join(document)
    print(document)
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)