def summarizeAndBigram(filename, url):
    # Request the html from the link given
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'html5lib')
    all_paras = soup.find_all("div", {"class": "has-content-area"})
    data_2018 = ""
    for para in all_paras:
        data_2018 = data_2018 + para.text
    article_sum = ru.summarize(data_2018)

    # Print the summary gathered above
    print "Summary of data mining article"
    print "Three sentence summary"
    for sent in article_sum['top_n_summary']:
        print removeUnicode(sent)

    # Take the data extracted from the site and
    # create the bigrams based on the data.
    print "--------------------"
    print "Bigrams:"
    asc_2018 = removeUnicode(data_2018)
    bigWords = nltk.tokenize.word_tokenize(asc_2018)
    N = 25
    search = nltk.BigramCollocationFinder.from_words(bigWords)
    # Keep only bigrams that occur at least 2 times and contain no stopwords
    search.apply_freq_filter(2)
    search.apply_word_filter(
        lambda skips: skips in nltk.corpus.stopwords.words('english'))
    from nltk import BigramAssocMeasures
    idxJaccard = BigramAssocMeasures.jaccard
    bigrams = search.nbest(idxJaccard, N)

    # Print the bigrams after the filters have been applied
    for bigram in bigrams:
        print str(bigram[0]).encode('utf-8'), " ", str(bigram[1]).encode('utf-8')
    print
    print
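# removeUnicode is called throughout these scripts but is never defined in the
# fragments shown here. A minimal sketch of what it might look like, assuming
# it simply drops characters that cannot be represented in ASCII (the name and
# behavior are inferred from how the callers use it, not from the original helper):
def removeUnicode(text):
    # Encode to ASCII, ignoring any characters that cannot be represented
    return text.encode('ascii', 'ignore')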
print "Sentiment Summation: %f" % overall_sentiment print "***********Summary*******************" fileObj = codecs.open("DT_Platform.rtf", "w", "UTF") html = requests.get("https://donaldjtrump.com/positions/tax-reform") soup = BeautifulSoup(html.text, "html5lib") all_paras = soup.find_all('p') data_trump = "" for para in all_paras: fileObj.write(para.text) data_trump = data_trump + para.text trump_sum = ru.summarize(data_trump) print "Summary of Trump tax reform: " for sent in trump_sum['top_n_summary']: print removeUnicode(sent) articleAscii = removeUnicode(data_trump) words = [] #num_Co is the number of collocations to find N = 25 #need list of Words by sentence sentences = nltk.tokenize.word_tokenize(articleAscii) for sentence in sentences: for word in nltk.tokenize.word_tokenize(sentence):
def scrapePage():
    page = "https://www.google.com/intl/en/policies/technologies/"
    print "------------------------"
    print " Page: ", page
    print "------------------------"
    html = requests.get(page)
    soup = BeautifulSoup(html.text, 'html5lib')
    all_paras = soup.find_all('p')
    data_2017 = ""
    for para in all_paras:
        data_2017 = data_2017 + para.text

    article_sum = ru.summarize(data_2017)
    print "------------------------"
    print " Three Sentence Summary"
    print "------------------------"
    for sent in article_sum['top_n_summary']:
        print removeUnicode(sent)

    # Tag every token with its part of speech, sentence by sentence
    asc_2017 = removeUnicode(data_2017)
    lstSent = nltk.tokenize.sent_tokenize(asc_2017)
    sentWords = [nltk.tokenize.word_tokenize(s) for s in lstSent]
    posWords = [nltk.pos_tag(w) for w in sentWords]
    posWords = [token for sent in posWords for token in sent]

    # Collect runs of consecutive nouns into chunks
    chunkCollector = []
    foundChunk = []
    lastPos = None
    for (token, pos) in posWords:
        if pos == lastPos and pos.startswith('NN'):
            foundChunk.append(token)
        elif pos.startswith('NN'):
            if foundChunk != []:
                # Something in the hopper, so add it to the collection
                chunkCollector.append((' '.join(foundChunk), pos))
            foundChunk = [token]
        lastPos = pos

    # Convert to a dictionary and count occurrences of each chunk
    dChunk = {}
    for chunk in chunkCollector:
        dChunk[chunk] = dChunk.get(chunk, 0) + 1

    print "------------------------"
    print " Most Common Noun Usage"
    print "------------------------"
    for (entity, pos) in sorted(dChunk, key=dChunk.get, reverse=True)[:7]:
        print '\t%s (%s)' % (entity, dChunk[entity, pos])

    # Repeat the same chunking for verbs
    chunkCollector = []
    foundChunk = []
    lastPos = None
    for (token, pos) in posWords:
        if pos == lastPos and pos.startswith('V'):
            foundChunk.append(token)
        elif pos.startswith('V'):
            if foundChunk != []:
                # Something in the hopper, so add it to the collection
                chunkCollector.append((' '.join(foundChunk), pos))
            foundChunk = [token]
        lastPos = pos

    dChunk = {}
    for chunk in chunkCollector:
        dChunk[chunk] = dChunk.get(chunk, 0) + 1

    print "------------------------"
    print " Most Common Verb Usage"
    print "------------------------"
    for (entity, pos) in sorted(dChunk, key=dChunk.get, reverse=True)[:7]:
        print '\t%s (%s)' % (entity, dChunk[entity, pos])
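# The noun and verb passes above are identical except for the POS prefix. A
# sketch of how they could be factored into one helper (collectChunks is a
# hypothetical name; unlike the loops above, it also flushes the final chunk):
def collectChunks(posWords, prefix):
    # Group runs of consecutive tokens whose tag starts with the given prefix
    chunkCollector = []
    foundChunk = []
    lastPos = None
    for (token, pos) in posWords:
        if pos == lastPos and pos.startswith(prefix):
            foundChunk.append(token)
        elif pos.startswith(prefix):
            if foundChunk:
                chunkCollector.append((' '.join(foundChunk), pos))
            foundChunk = [token]
        lastPos = pos
    if foundChunk:
        # Flush whatever is still in the hopper, tagging it with the prefix
        chunkCollector.append((' '.join(foundChunk), prefix))
    return chunkCollector

# e.g. chunkCollector = collectChunks(posWords, 'NN')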
fileObj = codecs.open("17_HO1.rtf", "w", "UTF") html = requests.get("http://swe.umbc.edu/~rayg/econ_plan.html") soup = BeautifulSoup(html.text, 'html5lib') all_paras = soup.find_all('p') # Write test to file and collate it into a str var data_2017 = "" for para in all_paras: fileObj.write(para.text) data_2017 = data_2017 + para.text Iceberg_sum = ru.summarize(data_2017) print "Summary of new iceberg" print "Print Three Sentence Summary" for sentence in Iceberg_sum['top_n_summary']: print removeUnicode(sentence) asc_2017 = removeUnicode(data_2017) bigWords = nltk.tokenize.word_tokenize(asc_2017) N = 25 search = nltk.BigramCollocationFinder.from_words(bigWords) search.apply_freq_filter(2) search.apply_word_filter(lambda skips: skips in nltk.corpus.stopwords.words('English'))
# Create a file to output to, fetch the html from the link, and build a BeautifulSoup object
fileObj = codecs.open("proj2.rtf", "w", "utf-8")
html = requests.get("https://www.ecommercetimes.com/story/52616.html")
soup = BeautifulSoup(html.text, 'html5lib')

# part 3
# Search through all the paragraph tags to gather data and
# use russell to summarize the incoming data
all_paras = soup.find_all('p')
data_2018 = ""
for para in all_paras:
    fileObj.write(para.text)
    data_2018 = data_2018 + para.text
article_sum = ru.summarize(data_2018)

# Print the summary gathered above
print "Summary of data mining article"
print "Three sentence summary"
for sent in article_sum['top_n_summary']:
    print removeUnicode(sent)

# part 4
# Take the data extracted from the site and
# create the bigrams based on the data.
print "--------------------"
print "Bigrams:"
asc_2018 = removeUnicode(data_2018)
bigWords = nltk.tokenize.word_tokenize(asc_2018)
N = 25
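# The fragment ends at N = 25. A sketch of the bigram step that likely follows,
# mirroring summarizeAndBigram (this is an assumption, not the original code):
search = nltk.BigramCollocationFinder.from_words(bigWords)
search.apply_freq_filter(2)
search.apply_word_filter(
    lambda skips: skips in nltk.corpus.stopwords.words('english'))
from nltk import BigramAssocMeasures
bigrams = search.nbest(BigramAssocMeasures.jaccard, N)
for bigram in bigrams:
    print str(bigram[0]).encode('utf-8'), " ", str(bigram[1]).encode('utf-8')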
print "Sentiment Summation: %f" % overall_sentiment print "***********Summary*******************" fileObj = codecs.open("DT_Platform.rtf", "w", "UTF") html = requests.get("https://donaldjtrump.com/positions/tax-reform") soup = BeautifulSoup(html.text, "html5lib") all_paras = soup.find_all('p') data_trump = "" for para in all_paras: fileObj.write(para.text) data_trump = data_trump + para.text trump_sum = ru.summarize(data_trump) print "Summary of Trump tax reform: " for sent in trump_sum['top_n_summary']: print removeUnicode(sent) articleAscii = removeUnicode(data_trump) words = [] #num_Co is the number of collocations to find N=25 #need list of Words by sentence sentences = nltk.tokenize.word_tokenize(articleAscii) for sentence in sentences: for word in nltk.tokenize.word_tokenize(sentence):
# Convert to a dictionary and count occurrences of each chunk
dChunk = {}
for chunk in chunkCollector:
    dChunk[chunk] = dChunk.get(chunk, 0) + 1

print "\nChunking"
for (entity, pos) in dChunk:
    if entity.istitle():
        print '\t%s (%s)' % (entity, dChunk[entity, pos])

# Create a 3 sentence summary from the article text.
# This portion did not work with the 'russel.pyc' component that was on
# Blackboard, so it is commented out.
# import russell as ru
# articleSum = ru.summarize(articleText)
# print "Summary of Article"
# print "Three Sentence Summary"
# for each in articleSum['top_n_summary']:
#     print removeUnicode(each)

# Tokenize first so the collocation finder sees words rather than characters
search = nltk.BigramCollocationFinder.from_words(
    nltk.tokenize.word_tokenize(articleAscii))
# Filter out collocations that do not occur at least 2 times
search.apply_freq_filter(2)
# Filter out collocations that contain stopwords
search.apply_word_filter(lambda skip: skip in skips)
# We use the Jaccard index to find our bigrams
# idxJaccard = nltk.metrics.BigramAssocMeasures.jaccard
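# The skips collection used in the stopword filter above is not defined in this
# fragment; in the original file it would have to be built before the filter
# runs. A sketch of what it might contain, assuming the NLTK English stopwords
# plus punctuation (the exact contents are an assumption):
import string
skips = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)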
def sentenceSummary(data):
    # Three sentence summary (#3 of grading rubric)
    summary = ru.summarize(data)
    print "----- Three sentence summary of the article -----"
    for sent in summary['top_n_summary']:
        print removeUnicode(sent)
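# A usage example for sentenceSummary, reusing the scraping pattern and a URL
# that already appear in the scripts above (the pairing is illustrative):
html = requests.get("https://www.ecommercetimes.com/story/52616.html")
soup = BeautifulSoup(html.text, 'html5lib')
data = "".join(p.text for p in soup.find_all('p'))
sentenceSummary(data)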