import re
import sys

import goose
import lxml.etree
import readability.readability
from bs4 import BeautifulSoup


def extract_clean_content(content):
    global __version__
    # I found out about goose and readability here:
    # http://stackoverflow.com/questions/14164350/identifying-large-bodies-of-text-via-beautifulsoup-or-other-python-based-extract
    # The poster seems to like goose more.
    # One difference is that goose cleans up all the html, while readability
    # usually just removes cruft that isn't related to the article text.
    # There is a trade-off between retaining links and formatting, and
    # getting cleaner text.
    # Readability seems to be better at finding the content in some cases,
    # so it is used for initial cleaning; then goose is used, since its
    # plain-text output is easier to deal with downstream.
    method = None
    cleaned_content = ''

    ###### Readability code:
    readability_error = None
    try:
        document = readability.readability.Document(content)
        cleaner_content = document.summary().strip()
        if len(cleaner_content) > 50:
            content = cleaner_content
        else:
            readability_error = "Readability content too short: " + cleaner_content
    except readability.readability.Unparseable:
        readability_error = '\n'.join([str(i) for i in sys.exc_info()])
    except (lxml.etree.XMLSyntaxError,
            lxml.etree.DocumentInvalid,
            lxml.etree.ParserError):
        readability_error = '\n'.join([str(i) for i in sys.exc_info()])
    except (AttributeError, ValueError, TypeError):
        # These ought to be handled by readability itself.
        readability_error = '\n'.join([str(i) for i in sys.exc_info()])
    ######

    if not content.startswith('<html>'):
        content = '<html><body>' + content + '</body></html>'

    try:
        cleaned_content = goose.Goose({
            'parser_class': 'soup',
            'enable_image_fetching': False,
        }).extract(raw_html=content).cleaned_text
        method = 'goose'
    except ValueError:
        cleaned_content = ''

    if len(cleaned_content) < 1:
        # Goose doesn't do well with foreign-language content.
        # If we can't find content with goose, try extracting
        # all the text with Beautiful Soup.
        # Beautiful Soup doesn't attempt to extract the article;
        # it just finds all the text in the html, which seems to be
        # good enough since we've already used readability on the articles.
        content = re.sub(r'<br\s?/?>', '\n', content)
        cleaned_content = BeautifulSoup(content, 'lxml').text
        method = 'soup'

    return {
        'cleanerVersion': __version__,
        'method': method,
        'content': cleaned_content,
        'readability_error': readability_error,
        # Malformed should be true whenever we can detect an issue with the
        # content that was extracted.
        'malformed': len(cleaned_content) < 50,
    }
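# A minimal usage sketch for extract_clean_content(). The URL and the
# requests call are illustrative assumptions, not part of the original module.
import requests

html = requests.get('http://example.com/article').text
result = extract_clean_content(html)
# 'malformed' is True whenever the extracted text is shorter than 50 chars.
if result['malformed']:
    print('extraction looks bad: ' + str(result['readability_error']))
else:
    print(result['content'])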
import goose
import requests


def parse(url):
    page = requests.get(url)  # , proxies=proxies)
    g = goose.Goose()
    # article = g.extract(url=url)
    article = g.extract(raw_html=page.text)
    items = {}
    items['headline'] = article.title
    # ArticleExtractor is expected to be provided by the surrounding module.
    items['text'] = ArticleExtractor.filter_unicode(article.cleaned_text)
    return items
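# Hedged example of calling parse(); it assumes ArticleExtractor (with its
# filter_unicode method) is importable from the surrounding project, and the
# URL is a placeholder.
items = parse('http://example.com/news/story')
print(items['headline'])
print(items['text'][:200])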
import goose


def goosefy(content, article):
    cleaned_text = ''
    if len(content.strip()) > 0:
        cleaned_text = goose.Goose({
            'enable_image_fetching': False,
            'use_meta_language': False,
            'target_language': 'pt',
        }).extract(raw_html=content).cleaned_text
    if len(cleaned_text) == 0:
        # Fall back to the feed summary when goose finds no article text.
        if 'summary' in article:
            cleaned_text = article['summary']
    return cleaned_text
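# Sketch of a goosefy() call. The HTML and the feed-entry dict below are made
# up for illustration; when goose extracts nothing, the function falls back
# to article['summary'].
entry = {'summary': u'Resumo do artigo vindo do feed.'}
html = u'<html><body><p>Texto muito curto.</p></body></html>'
print(goosefy(html, entry))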
import dateparser
import goose
# a.doc is an lxml node, so tostring is assumed to come from lxml.etree.
from lxml.etree import tostring


def parse_article(url):
    g = goose.Goose()
    a = g.extract(url=url)
    return {
        'url': url,
        'author': ','.join(a.authors),
        'date': str(dateparser.parse(a.publish_date or '')),
        # Put each sentence on its own line for readability downstream.
        'text': a.cleaned_text.replace('. ', '.\n '),
        'title': a.title,
        'links': a.links,
        'topics': parse_text(a.cleaned_text),  # parse_text() is defined elsewhere
        'html': tostring(a.doc),
        'domain': get_domain(url),  # get_domain() is defined elsewhere
        'raw_html': a.raw_html,
    }
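# Illustrative call to parse_article(); parse_text() and get_domain() must be
# supplied by the surrounding module, and the URL is a placeholder.
info = parse_article('http://example.com/2019/06/some-story')
print(info['title'] + ' (' + info['domain'] + ')')
print(info['topics'])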
# -*- coding: utf-8 -*-
"""
@Time: 2019/6/21 13:54
"""
import urllib2

import goose
from goose.text import StopWordsChinese

url = "http://www.sohu.com/a/299667318_501931"

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
response = opener.open(url)
raw_html = response.read()

# Use the Chinese stop-word list so goose can score Chinese text.
g = goose.Goose({'stopwords_class': StopWordsChinese})
a = g.extract(raw_html=raw_html)

print(a.infos)
print(a.title)
print("--" * 20)
print(a.cleaned_text)
    print 'The server couldn\'t fulfill the request.'
    print 'Error code: ', e.code
else:
    print 'URL is good!'

#==============================================================================
# Below is open-sourced
#==============================================================================
try:
    with open('basicsite.csv', 'w') as dsite:
        wr = csv.writer(dsite, quoting=csv.QUOTE_ALL)

        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(qqll)
        raw_html = response.read()

        g = goose.Goose()
        a = g.extract(raw_html=raw_html)
        htext = a.cleaned_text

        # Sentiment scores for the extracted text.
        opinion = TextBlob(htext)
        pol = opinion.sentiment.polarity
        sub = opinion.sentiment.subjectivity

        # Round-trip time for the same URL via requests.
        rt = requests.get(qqll).elapsed.total_seconds()

        # Top three keywords, with newlines squashed.
        kw = str(keywords(htext, lemmatize=True))
        kw = kw.replace('\r', ' ').replace('\n', ' ')
        keyw = ' '.join(kw.split()[:3])

        sbody = htext.replace(',', '')

        # Readability statistics.
        fkg = textstat.flesch_kincaid_grade(htext)
        wc = textstat.lexicon_count(htext)
        sc = textstat.sentence_count(htext)
        fre = textstat.flesch_reading_ease(htext)

        sinsite = [
def __init__(self, url, db, collection):
    self.url = url
    connection = pymongo.Connection()
    # Look up the database and collection by name instead of building a
    # "connection.<db>.<collection>" string and calling eval() on it.
    self.collection = connection[db][collection]
    self.goose = goose.Goose()
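# Instantiation sketch, assuming this __init__ belongs to a scraper class
# (here called ArticleStore purely for illustration) and that a local MongoDB
# instance is running; requires `import goose` and `import pymongo` in scope:
#
#   store = ArticleStore('http://example.com/article', 'newsdb', 'articles')
#   print(store.collection.count())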