Example #1
File: app.py  Project: yevbar/monastery
def add_new_entry():
    import nltk
    nltk.download("punkt")
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    urls = []  # URLs to summarize
    for url in urls:
        LANGUAGE = "english"
        SENTENCES_COUNT = 1
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        my_summary = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            my_summary.append(sentence)
        print(my_summary)

        mongo.db.summaries.insert_one({
            "sentence": str(my_summary[0]).split(),
            "url": url
        })

        #print((str(my_summary[0])).split())
    vals = mongo.db["summaries"]
    cursor = vals.find({})
    print({"vals": loads(dumps(cursor))})
Example #2
    def get_summary(self, text_source: str, num_sentences: int = 5) -> list:
        # url = "https://www.cbc.ca/news/canada/toronto/skinny-dipping-sharks-ripleys-1.4862945"
        parser = HtmlParser.from_url(text_source, self.Tokenizer)

        doc = parser.document

        return list(self.Summarizer(doc, num_sentences))
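The method above relies on `self.Tokenizer` and `self.Summarizer` being set up elsewhere in its class. A hypothetical enclosing class, sketched only to make the snippet self-contained (every name outside the method is an assumption):

# Hypothetical wrapper class for Example #2.
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

class UrlSummarizer:
    def __init__(self, language: str = "english"):
        # Shared tokenizer and LSA summarizer reused across calls.
        self.Tokenizer = Tokenizer(language)
        self.Summarizer = LsaSummarizer(Stemmer(language))
        self.Summarizer.stop_words = get_stop_words(language)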
Example #3
def summarize_article(article, vibe_description_file_path):
    try:
        article_url = article['alternate'][0]['href']
        article_title = article['title']

        article_newspaper = Article(article_url)
        article_newspaper.download()
        article_newspaper.parse()
        article_newspaper.nlp()

        text_content = article_newspaper.text
        update_json_file(vibe_description_file_path, 'textContent',
                         text_content)

        LANGUAGE = 'english'
        parser = HtmlParser.from_url(article_url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        article_summary = []
        for sentence in summarizer(parser.document, 3):
            article_summary.append(str(sentence))

    except Exception as err:
        print('Error summarizing article:', err)
        return False

    update_json_file(vibe_description_file_path, 'summary', article_summary)
    update_json_file(vibe_description_file_path, 'keywords',
                     article_newspaper.keywords)

    return True
Example #4
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example #5
def summarize(url=None, LANGUAGE='english', SENTENCES_COUNT=2):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result = result + ' ' + str(sentence)
    except Exception:
        print(
            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'
        )
        sys.stdout.flush()
        return (
            '\n\n Invalid entry! Please ensure you enter a valid web link \n\n'
        )
    print('\n\n' + str(url) + '\n\n' + str(result))
    sys.stdout.flush()
    return result
Example #6
def sum_from_url(url, language="english", sentences_count=100):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_count)
    return sentences
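The value returned here is a tuple of sumy Sentence objects, not strings; a small usage sketch (the URL is just an example):

# Usage sketch for sum_from_url: convert Sentence objects to text.
for sentence in sum_from_url("https://en.wikipedia.org/wiki/Automatic_summarization",
                             sentences_count=5):
    print(str(sentence))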
Example #7
 def get_summary(self, summary_length: int = 10) -> Iterator[str]:
     parser = HtmlParser.from_url(self.link, Tokenizer(LANGUAGE))
     stemmer = Stemmer(LANGUAGE)
     summarizer = Summarizer(stemmer)
     summarizer.stop_words = get_stop_words(LANGUAGE)
     for sentence in summarizer(parser.document, summary_length):
         yield str(sentence)
Example #8
    def store_summary(self):
        
        for item in self.doc_id_url:
            if item < len(self.document_info):
                #soup = self.document_info[item]
                s = requests.Session()
                response = s.get(self.doc_id_url[item])
                if response.status_code != 404:
                    parser = HtmlParser.from_url(self.doc_id_url[item], Tokenizer("english"))
                    text = ""
                    """
                    for tag in soup.findAll('p'):
                        text = text + tag.text
                    """
                    stemmer = Stemmer("english")

                    summarizer = Summarizer(stemmer)
                    summarizer.stop_words = get_stop_words("english")
                    
                    for sentence in summarizer(parser.document, 5):
                        print(sentence)
                        if item in self.summary:
                            self.summary[item] = self.summary[item] + str(sentence)
                        else:
                            self.summary[item] = str(sentence)
Example #9
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    url = findURLS(inputFile)
    if url is not None:
        if url[-1] == '.':
            url = url[0:-1]
        urlContent = ''
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent = urlContent + str(sentence) + '\n'
        except Exception:
            urlContent = ''
    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary = summary + str(sentence) + '\n'
    except AssertionError:
        return None
    if url is not None:
        return summary + urlContent
    return summary
Example #10
    def ExtractivelySummarizeCorpus(self,
                                    corpus_path: str,
                                    HTML: bool = True,
                                    sentence_count: int = 20):

        if HTML:
            self.parser = HtmlParser.from_url(corpus_path, Tokenizer(LANGUAGE))
        else:
            # or for plain text files
            self.parser = PlaintextParser.from_file(corpus_path,
                                                    Tokenizer(LANGUAGE))

        sentences = self.summarizer(self.parser.document, sentence_count)

        if DEBUG:
            # logger.info("DEBUG::ExtractivelySummarizeCorpus::these are all the parser.document.sentences")
            # logger.info(self.parser.document.sentences)
            logger.info(
                "DEBUG::ExtractivelySummarizeCorpus::top n=%d sentences:" %
                sentence_count)
            for sentence in sentences:
                logger.info(str(sentence))
        sentences = [str(sentence) for sentence in sentences]

        return sentences
Example #11
def get_sentences(url, sentences_count=10):
    """
    Returns the important sentences given a url
    """
    parser = HtmlParser.from_url(url, Tokenizer(language))
    sentences = summarizer(parser.document, sentences_count)
    return sentences
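`language` and `summarizer` are module-level globals not shown in this snippet; one plausible definition, assuming the LSA summarizer used by most examples on this page:

# Assumed globals for Example #11.
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

language = "english"
summarizer = LsaSummarizer(Stemmer(language))
summarizer.stop_words = get_stop_words(language)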
Example #12
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        # parser = None
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer

        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)
        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    return sentences
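`read_pdf` is a helper defined elsewhere in that project; a minimal stand-in using pypdf (the choice of library is an assumption) could be:

# Hypothetical read_pdf helper for Example #12, sketched with pypdf.
from pypdf import PdfReader

def read_pdf(path):
    reader = PdfReader(path)
    # Join the extracted text of every page; extract_text() may return None.
    return "\n".join(page.extract_text() or "" for page in reader.pages)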
Example #13
def main(url, num_sentences=10, language='english'):
	parser = HtmlParser.from_url(url, Tokenizer(language))
	stemmer = Stemmer(language)
	summarizer = Summarizer(stemmer)
	summarizer.stop_words = get_stop_words(language)
	for sentence in summarizer(parser.document, num_sentences):
		print(sentence)
Example #14
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Example #15
def summarizer(input_obj, SENTENCES_COUNT=2, op='url'):
    LANGUAGE = "english"
    # SENTENCES_COUNT = 1
    # url =  "https://sea.pcmag.com/smartphones/17424/apple-iphone-x"

    # text = ' '.join(text.split())
    # print(input_obj)
    # print(type(input_obj))
    parser = None
    if op == 'text':
        text = input_obj['text']
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif op == 'url':
        url = input_obj['link']
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    else:
        print('OP ERROR')
        return []
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence)
        sentences.append(str(sentence))
    return sentences


# print(get_summarize("https://sea.pcmag.com/smartphones/17424/apple-iphone-x"))
Example #16
def summarizer(request):
    inp = request.POST['geturl']

    LANGUAGE = "english"
    SENTENCES_COUNT = 10

    url = str(inp)

    with open("denemedosyasiU3.txt", "w") as f:
        f.write(url)

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return render(request, 'home.html', {'data1': '\n'.join(sentences)})
Example #17
def url(request):
    if (request.GET.get('url', 'url').lower() not in ['url', 'image']):
        url = request.GET.get('url', 'url')
        print(url)
        LANGUAGE = "english"
        SENTENCES_COUNT = 5
        out = []
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            out.append(str(sentence))
        r = requests.get(url)
        test = url.split("/")
        urlval = str('/'.join(test[:3]))
        data = r.text
        soup = BeautifulSoup(data, "lxml")
        temp = []
        for link in soup.find_all('img'):
            image = link.get("src")
            temp.append(image)
        for loc, i in enumerate(temp):
            if i and i[0] == "/":
                temp[loc] = urlval + temp[loc]
        return ({'content': str("\n".join(out)) + '  '.join(temp)})
Example #18
def CreateDataSet(w):
    try:
        urls = url(w)
        found_about = False
        for link in urls:
            if link not in allExtLinks:
                # Create a list of each bit between slashes
                slashparts = link.split('/')
                if "about" in slashparts:
                    found_about = True
                    scrapped_about.append(link)
                    print('\n', link)
                    parser = HtmlParser.from_url(link, Tokenizer("english"))
                    summary = summarizer(parser.document, 2)
                    # saving the summary to a dataframe
                    for sentence in summary:
                        print(sentence, '\n')
        if not found_about:
            print('There are no "about" linked pages in this url')
    except Exception:
        print('There seems to be an issue with the URL you entered')
        quit()
Example #19
def sumySummary(url):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [
        cleanText(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)
    ]
Example #20
def get_summ(url, func=Summarizer3):
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = func(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sumy = summarizer(parser.document, SENTENCES_COUNT)
    result = [str(i) for i in list(sumy)]
    return result
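`Summarizer3` is defined elsewhere in that project; one plausible binding, assuming it is simply an alias for a third sumy summarizer:

# Assumed alias for Example #20's default summarizer.
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer3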
Example #21
def get_summary(url):
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')

    print("\nThree Sentence Summary:\n")
    for sentence in summarizer(parser.document, 3):
        print(sentence)
Example #22
    def summarize_url(self, url, sentences=3, language="english"):
        parser = HtmlParser.from_url(url, Tokenizer(language))
        stemmer = Stemmer(language)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)

        text = " ".join(map(str, summarizer(parser.document, sentences)))
        return " ".join(text.split())
Example #23
def summarize(url):
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(str(sentence))
    return ' '.join(summary)
Example #24
def getSentencesFrom(url):
    lang = "english"
    try:
        parser = HtmlParser.from_url(url, Tokenizer(lang))
    except Exception:
        print("HTTP ERROR @ " + url)
        return []
    sentences = list(parser.document.sentences)
    return [str(s) for s in sentences]
Example #25
def main():
    url = "http://www.spiegel.de/international/europe/as-brexit-nears-harrassment-of-eu-citizens-in-uk-rises-a-1181845.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    run_LSA(stemmer, parser.document)
    run_LexRank(stemmer, parser.document)
    run_TextRank(stemmer, parser.document)
    run_Luhn(stemmer, parser.document)
    run_SumBasic(stemmer, parser.document)
Example #26
def urlDoc_summarize(url):
    parser = HtmlParser.from_url(url, Tokenizer('english'))
    stemmer = Stemmer('english')
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words('english')
    summary = ''
    for sentence in summarizer(parser.document, 15):
        summary += str(sentence) + ' '
    return summary
Example #27
    def summCallback(self, url2open):
        parser = HtmlParser.from_url(url2open, Tokenizer("english"))
        stemmer = Stemmer("english")

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words("english")

        self.area.delete("0.0", END)
        for sentence in summarizer(parser.document, 10):
            self.area.insert(END, sentence)
Example #28
def retreive_sumy(url):
    # "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return summarizer(parser.document, SENTENCES_COUNT)
Example #30
 def do_stuff():
     summary_final = ""
     parser = HtmlParser.from_url(url, Tokenizer(Config.sumy_lang))
     stemmer = Stemmer(Config.sumy_lang)
     summarizer = Summarizer(stemmer)
     summarizer.stop_words = get_stop_words(Config.sumy_lang)
     for sentence in summarizer(parser.document,
                                Config.sumy_num_sentences):
         summary_final = summary_final + " " + str(sentence)
     return summary_final
Example #31
def summarize(url, sent_count=10):
    """Automatic text summarizer
    https://pypi.python.org/pypi/sumy
    """
    lang = "english"
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    summary = [str(sent) for sent in summarizer(parser.document, sent_count)]
    return summary
Example #32
 def summarizeUrl(self, url, numSentences = 10):
     """Summarizes text at a given url to numSentences
     """
     #parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
     parser = HtmlParser.from_url(url, Tokenizer(self.LANG))        
     stemmer = Stemmer(self.LANG)
     summarizer = SumySummarizer(stemmer)
     summarizer.stop_words = get_stop_words(self.LANG)
     summary = ' '.join([str(sentence) for sentence in summarizer(parser.document, numSentences)])
     return summary
Example #33
def summarize_url(url):
    try:
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    except Exception:
        return [""] * SENTENCES_COUNT
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    outs = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        outs.append(str(sentence))
    return outs
Example #34
def summarize(url, sent_len=SENTENCES_COUNT):
    #url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)


    return {'summary':[str(sentence) for sentence in summarizer(parser.document, sent_len)]}
Example #35
def readSum(first, second, third):
    #response = requests.get(url+"play?fname="+str(first)+str(second)+str(third))
    parser = HtmlParser.from_url(
        url + "play?fname=" + str(first) + str(second) + str(third),
        Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    msg = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        msg = msg + str(sentence)
    return statement(msg)
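`statement` here is presumably Flask-Ask's Alexa response helper; under that assumption, the surrounding skill would be wired up roughly like this (the app setup is illustrative):

# Assumed Flask-Ask context for Example #35.
from flask import Flask
from flask_ask import Ask, statement

app = Flask(__name__)
ask = Ask(app, "/")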
Example #36
    def getText(self, sentence_count=None):
        if sentence_count:
            self.SENTENCE_COUNT = sentence_count
        parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(self.LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        text_list = []

        for sentence in summarizer(parser.document, self.SENTENCE_COUNT):
            text_list.append(str(sentence))
        return "\n".join(text_list)
Example #37
def index():
    # url = "http://www.dawn.com/news/1216282"
    # -------------------------------------------------------------------------------
    # -------  Need help here ------------------#
    if request.method == 'POST':
         url = request.json.get('url')
         line_count = request.json.get('line_count')
    # ---------------------------------------------------------------------------

         parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
         print (parser)
    # stemmer = Stemmer(LANGUAGE)
    #
    # summarizer = Summarizer(stemmer)
    # summarizer.stop_words = get_stop_words(LANGUAGE)

    # s = ""
    # for sentence in summarizer(parser.document, SENTENCES_COUNT):
    #     s += str(sentence)

    return jsonify(dict(message='stuff'))
Example #38
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# LANGUAGE = "english"
LANGUAGE = "czech"

SENTENCES_COUNT = 10

if __name__ == "__main__":
    # parser = PlaintextParser.from_file("yelp1.txt", Tokenizer(LANGUAGE))
    url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
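Several snippets above summarize raw text rather than a URL; for reference, the same pipeline over an in-memory string uses PlaintextParser.from_string, reusing this example's imports and constants (set LANGUAGE to match the text's language):

# Variant of Example #38 for an in-memory string.
text = "Text to summarize goes here. It should contain several sentences."
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
summarizer = Summarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)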
Example #39
def summarize(url, sent_count=10):
    "Produces `sent_cout` sentence summaries of `url`."
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    return " ".join([str(sentence) for sentence
                     in summarizer(parser.document, sent_count)])
Example #40
def getSummaryFromWebsite(url, sentences_count):

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    return summarize(parser, sentences_count)
Example #41
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import lxml.html


list_of_pages = ['http://postach.us10.list-manage1.com/track/click?u=819841bd24897de296a130d94&id=1fbd285a11&e=01afa4fcef']

stemmer = Stemmer('english')
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words('english')

if __name__ == "__main__":
  for url in list_of_pages:
      parser = HtmlParser.from_url(url, Tokenizer('english'))
      print(lxml.html.parse(url).find(".//title").text)
      print(url)
      for sentence in summarizer(parser.document, 2):
          print(sentence)
Example #42
def analyze_web_site(url):
    print("Main Points: %s \n" % url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    analyze(parser)