def extract_clean_content(content):
    global __version__
    # I found out about goose and readability here:
    # http://stackoverflow.com/questions/14164350/identifying-large-bodies-of-text-via-beautifulsoup-or-other-python-based-extract
    # The poster seems to like goose more.
    # One difference is that goose cleans up all the html, while readability
    # usually just removes cruft that isn't related to the article text.
    # There is a trade-off between retaining links and formatting, and
    # getting cleaner text.
    # Readability seems to be better at finding the content in some cases,
    # so it is used for initial cleaning, then goose is used since its
    # plain text output is easier to deal with downstream.
    # (A minimal usage sketch follows this function.)
    method = None
    cleaned_content = ''
    ###### Readability code:
    readability_error = None
    try:
        document = readability.readability.Document(content)
        cleaner_content = document.summary().strip()
        if len(cleaner_content) > 50:
            content = cleaner_content
        else:
            readability_error = "Readability content too short: " + cleaner_content
    except readability.readability.Unparseable as e:
        readability_error = '\n'.join([str(i) for i in sys.exc_info()])
    except (lxml.etree.XMLSyntaxError, lxml.etree.DocumentInvalid,
            lxml.etree.ParserError) as e:
        readability_error = '\n'.join([str(i) for i in sys.exc_info()])
    except (AttributeError, ValueError, TypeError) as e:
        # This ought to be handled by readability.
        readability_error = '\n'.join([str(i) for i in sys.exc_info()])
    ######

    if not content.startswith('<html>'):
        content = '<html><body>' + content + '</body></html>'
    try:
        cleaned_content = goose.Goose({
            'parser_class': 'soup',
            'enable_image_fetching': False,
        }).extract(raw_html=content).cleaned_text
    except ValueError:
        cleaned_content = ''
    if len(cleaned_content) < 1:
        # Goose doesn't do well with foreign language content.
        # If we can't find content with goose try extracting
        # all the text with Beautiful soup.
        # Beautiful soup doesn't attempt to extract the article,
        # it just finds all the text in the html, which seems to be
        # good enough since we've already used readability on the articles.
        content = re.sub(r'<br\s?/?>', '\n', content)
        cleaned_content = BeautifulSoup(content, 'html.parser').text
    return {
        'clearnerVersion': __version__,
        'method': method,
        'content': cleaned_content,
        'readability_error': readability_error,
        # Malformed should be true whenever we can detect an issue with the
        # content that was extracted.
        'malformed': len(cleaned_content) < 50
    }
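A minimal usage sketch for the function above, assuming readability-lxml, goose, lxml, bs4, re and sys are imported in the enclosing module and that __version__ is defined there:

if __name__ == '__main__':
    sample_html = ('<html><body><article><p>'
                   + 'Some example article text. ' * 30
                   + '</p></article></body></html>')
    result = extract_clean_content(sample_html)
    print(result['malformed'])          # False when at least 50 characters were extracted
    print(result['readability_error'])  # None unless readability failed or returned too little
    print(result['content'][:200])      # plain text from goose (or the BeautifulSoup fallback)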
Example #2
def parse(url):
    page = requests.get(url)  #, proxies = proxies)
    g = goose.Goose()
    #article = g.extract(url=url)
    article = g.extract(raw_html=page.text)
    items = {}
    items['headline'] = article.title
    items['text'] = ArticleExtractor.filter_unicode(article.cleaned_text)
    return items
Example #3
def goosefy(content, article):
    cleaned_text = ''
    if len(content.strip()) > 0:
        cleaned_text = goose.Goose({'enable_image_fetching': False, 'use_meta_language': False,
                                'target_language': 'pt'}).extract(raw_html=content).cleaned_text
    if len(cleaned_text) == 0:
        if 'summary' in article:
            cleaned_text = article['summary']

    return cleaned_text
Example #4
def parse_article(url):
    g = goose.Goose()
    a = g.extract(url)
    return {
        'url': url,
        'author': ','.join(a.authors),
        'date': str(dateparser.parse(a.publish_date or '')),
        'text': a.cleaned_text.replace('. ', '.\n '),
        'title': a.title,
        'links': a.links,
        'topics': parse_text(a.cleaned_text),
        'html': tostring(a.doc),
        'domain': get_domain(url),
        'raw_html': a.raw_html
    }
Example #5
# -*- coding: utf-8 -*-
"""
 @Time: 2019/6/21 13:54
"""

import time
import os
import sys

import urllib2
import goose
from goose.text import StopWordsChinese

url = "http://www.sohu.com/a/299667318_501931"
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
response = opener.open(url)
raw_html = response.read()

g = goose.Goose({'stopwords_class': StopWordsChinese})
a = g.extract(raw_html=raw_html)
print(a.infos)
print(a.title)
print("--"*20)
print(a.cleaned_text)
Example #6
# Assumed wrapper (the top of this snippet is truncated): check that qqll responds before scraping.
try:
    urllib2.urlopen(qqll)
except urllib2.HTTPError as e:
    print 'The server couldn\'t fulfill the request.'
    print 'Error code: ', e.code
else:
    print 'URL is good!'

#==============================================================================
# Below is open-sourced
#==============================================================================
try:
    with open('basicsite.csv', 'w') as dsite:
        wr = csv.writer(dsite, quoting=csv.QUOTE_ALL)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(qqll)
        rt = response
        raw_html = response.read()
        g = goose.Goose()
        a = g.extract(raw_html=raw_html)
        htext = a.cleaned_text
        opinion = TextBlob(htext)
        pol = opinion.sentiment.polarity
        sub = opinion.sentiment.subjectivity
        rt = requests.get(qqll).elapsed.total_seconds()
        kw = str(keywords(htext, lemmatize=True))
        kw = kw.replace('\r', ' ').replace('\n', ' ')
        keyw = ' '.join(kw.split()[:3])
        sbody = htext.replace(',', '')
        fkg = textstat.flesch_kincaid_grade(htext)
        wc = textstat.lexicon_count(htext)
        sc = textstat.sentence_count(htext)
        fre = textstat.flesch_reading_ease(htext)
        sinsite = [
Example #7
	def __init__ (self, url, db, collection):
		self.url = url
		connection = pymongo.Connection()
		collection_string = "connection.%s.%s" % (db, collection)
		self.collection = eval(collection_string)  # equivalent to connection[db][collection]
		self.goose = goose.Goose()