Example #1
def summarize(text):
    if isvalid(text):
        all_capital = False
        # An all-caps input yields an empty summary (the sentence tokenizer fails on it),
        # so lowercase the text here and restore the casing afterwards.
        if text.upper() == text:
            text = text.lower()
            all_capital = True

        if PY2:
            parser = PlaintextParser.from_string(
                text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
        else:
            parser = PlaintextParser.from_string(
                text.encode().decode('ascii', errors='ignore'),
                Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = [
            str(s)
            for s in summarizer(parser.document, sentences_count=n_sentences)
        ]

        output_sentences = ' '.join(sentences)
        if all_capital:
            output_sentences = output_sentences.upper()

        return output_sentences
    else:
        return ''
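A minimal sketch of the module-level context this snippet assumes; the summarizer class, the n_sentences value, and the isvalid() helper are assumptions here (any sumy summarizer and any validity check would do):

import sys

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.utils import get_stop_words

PY2 = sys.version_info[0] == 2
LANGUAGE = "english"
n_sentences = 3  # hypothetical default


def isvalid(text):
    # Hypothetical stand-in: accept any non-empty string.
    return bool(text and text.strip())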
Example #2
def sum_from_string(string, language="english", sentences_count=100):
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_count)
    return sentences
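A usage sketch, assuming the sumy imports from the example above are in scope; note the summarizer returns sumy Sentence objects rather than plain strings:

text = "Sumy ranks sentences by salience. It keeps the best ones. The rest are dropped."
for sentence in sum_from_string(text, sentences_count=2):
    print(sentence)  # Sentence objects render as text via str()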
Example #3
def get_doc_summary(html, url):
    '''
    Parse document text and extract a summary with summarization
    algorithms. This is helpful when the meta-description tag is
    not available.
    '''
    from sumy.parsers.html import HtmlParser
    # from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    res = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        res += str(sentence) + " "  # keep a space between sentences
    return res.strip()
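A usage sketch with a minimal inline document; the URL is a placeholder that sumy only uses to resolve relative links:

html = (
    "<html><body>"
    "<p>Sumy extracts the most salient sentences from a page. "
    "TextRank ranks sentences by graph centrality. "
    "Short pages may yield fewer sentences than requested.</p>"
    "</body></html>"
)
print(get_doc_summary(html, "http://example.com"))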
Example #4
def textSummary(data, SENTENCES_COUNT):
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    x = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        x += ' {}'.format(sentence)
    return x.strip()
Example #5
def summarize(url, sent_count=10):
    """Automatic text summarizer
    https://pypi.python.org/pypi/sumy
    """
    lang = "english"
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    summary = [str(sent) for sent in summarizer(parser.document, sent_count)]
    return summary
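A usage sketch, assuming HtmlParser and the other sumy names from the neighboring examples are imported; the URL is only an example target:

url = "https://en.wikipedia.org/wiki/Automatic_summarization"
for sentence in summarize(url, sent_count=3):
    print(sentence)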
Example #6
def summarize(text):

    # str has no decode() in Python 3; round-trip through bytes instead.
    parser = PlaintextParser.from_string(
        text.encode().decode('ascii', errors='ignore'),
        Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = [str(s) for s in summarizer(
        parser.document, sentences_count=n_sentences)]
    return ' '.join(sentences)
Example #7
def run_summarizer(parser, sentences, language='english'):
    """
    :param parser: Parser for the selected document type.
    :param sentences: Maximum number of sentences for the summarizer.

    :returns: Summarized page.
    """

    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return [
        str(sentence) for sentence in summarizer(parser.document, sentences)
    ]
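run_summarizer is parser-agnostic, so any sumy parser can feed it; a usage sketch with a plain-text parser, assuming Summarizer, Stemmer, and get_stop_words are already imported:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string(
    "One idea per sentence. Some sentences matter more. Others add little.",
    Tokenizer("english"))
print(run_summarizer(parser, sentences=2))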
Example #8
def summarizer(parser, sentences, language='english'):
    """
    :param parser: Parser for the selected document type.
    :param sentences: Maximum number of sentences for the summarizer.

    :returns: Summarized page.
    """
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)

    summarizer.stop_words = get_stop_words(language)

    output = [
        str(sentence) for sentence in summarizer(parser.document, sentences)
    ]

    return ' '.join(output)
Example #9
def apply(text, interest, top_k=5):
    """
    Return a tuple of two lists of tuples.
        The first list contains every sentence mentioning the interest and its corresponding weight in the document.
        The second list contains the top_k sentences of the document.
    """
    LANGUAGE = "english"
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sent_importance = summarizer.rate_sentences(parser.document)
    interesting_sent = []
    for sent in sent_importance:
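        # sumy Sentence objects expose their raw text via the private _text attribute.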
        if interest.lower() in sent._text.lower():
            interesting_sent.append((sent._text, sent_importance[sent]))
    top_sent = summarizer(parser.document, top_k)
    top_sent = [(s._text, sent_importance[s]) for s in top_sent]
    return (interesting_sent, top_sent)
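A usage sketch; rate_sentences() is provided by the graph-based summarizers (TextRank, LexRank), so Summarizer is assumed to be one of those, with the usual sumy imports in scope:

text = ("Solar power is growing quickly. Wind power is expanding too. "
        "Solar panels keep getting cheaper.")
interesting, top = apply(text, "solar", top_k=2)
for sentence, weight in interesting:
    print(round(weight, 3), sentence)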
Example #10
def main():
    LANGUAGE = "english"
    SENTENCES_COUNT = 2
    stop = set(stopwords.words('english'))

    #retrieve each of the articles
    articles = os.listdir("../data/articles")
    count = 0
    for article in articles:
        stdout.write("\rProgress: {:02.0f}%".format(
            float(count) / len(articles) * 100))
        stdout.flush()
        # print 'Reading articles/' + article
        # articleFile = io.open('articles/' + article, 'r')
        parser = PlaintextParser.from_file(
            os.path.abspath(os.path.join("../data/articles", article)),
            Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        summary = ""
        file_name = os.path.splitext(article)[0].split('.')[0]
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary += str(sentence) + " "

        summary_tokens = [
            token.lower().translate(str.maketrans('', '', punctuation))
            for token in word_tokenize(summary) if token not in punctuation
            and token.lower() not in stop and token != "'s"
        ]

        with open(os.path.join("results", file_name + ".txt"),
                  "w") as keywords_file:
            keywords_file.write('\n'.join(set(summary_tokens)))

        count += 1

    print("\nDone...")
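The imports this script assumes, reconstructed from usage (a sketch; the summarizer class in particular is an assumption):

import os
from string import punctuation
from sys import stdout

from nltk import word_tokenize
from nltk.corpus import stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.utils import get_stop_words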
Example #11
def summarize():

    final = []

    # Checking the integrity of the url query
    url = request.args.get('url')

    if not url:
        return abort(400)

    # Checking the integrity of the num query
    try:
        num = int(request.args.get('num'))

        num = MIN_SENTENCES_COUNT if num < MIN_SENTENCES_COUNT else num
        num = MAX_SENTENCES_COUNT if num > MAX_SENTENCES_COUNT else num

    except (ValueError, TypeError) as e:
        num = MIN_SENTENCES_COUNT

    # Handles error where url is not a valid url
    try:
        parser = Parser.from_url(url, Tokenizer(LANGUAGE))
    except (requests.exceptions.MissingSchema,
            requests.exceptions.HTTPError) as e:
        try:
            parser = Parser.from_url("http://" + url, Tokenizer(LANGUAGE))
        except Exception:
            return "URL is not valid.", 403

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Take each sentence and append
    for sentence in summarizer(parser.document, num):
        # unidecode converts Unicode characters to their closest ASCII equivalents
        final.append(unidecode(str(sentence)))

    return json.dumps({"title": parser.get_title(), "content": final})
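A minimal sketch of the module context this Flask view assumes. The sentence-count bounds and the summarizer class are assumptions, and since sumy's HtmlParser has no get_title(), the original Parser is presumably a small subclass that adds one:

import json

import requests
from flask import Flask, abort, request
from unidecode import unidecode
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
MIN_SENTENCES_COUNT = 3   # hypothetical lower bound
MAX_SENTENCES_COUNT = 20  # hypothetical upper bound


class Parser(HtmlParser):
    def get_title(self):
        # Placeholder: real code would extract the page <title>.
        return ""


app = Flask(__name__)
# The view above would then be registered, e.g.:
# app.add_url_rule("/summarize", "summarize", summarize)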
Example #12
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk
# nltk.download('punkt')  # run once to fetch the tokenizer data sumy needs

LANGUAGE = "english"
SENTENCES_COUNT = 1


if __name__ == "__main__":
    url = "http://www.businessinsider.in/Heres-a-super-quick-guide-to-what-traders-are-talking-about-right-now/articleshow/59387381.cms"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(parser.document)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)