Example #1
    def parse_content(self, response):
        """Extract the news content with newspaper."""
        item = response.meta['item']
        is_special, content = self._handle_special_site(response)
        if not is_special:
            # not one of the specially handled sites; let newspaper extract it
            article = Article(item['url'], language='zh')
            article.set_html(response.body)
            article.is_downloaded = True
            article.parse()
            item['pic'] = article.top_image
            item['content'] = str(article.text)
            publish_date = article.publish_date
            if publish_date:
                item['publish_date'] = publish_date.strftime(
                    "%Y-%m-%d %H:%M:%S")
            else:
                item['publish_date'] = "null"
        else:
            item['pic'] = ""
            item['content'] = content
        # extract content failed
        if item['content'] == '':
            logging.error("empty content in: " + response.url)
            yield item
            # raw_content = response.xpath("//body//p/text()").extract()
            # item['content'] = ''.join(raw_content)
        item['content'] = item['content'].strip().replace(u"\xa0", "").replace(u"\u3000", "").replace("|", "")\
            .replace("用微信扫码二维码分享至好友和朋友圈", "").strip("您当前的位置 :").strip("您所在的位置:").strip("提示:点击上方").strip(">").strip()
        yield item
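
All of these examples share the same pattern: hand newspaper an HTML string that was already fetched elsewhere instead of letting the library download the page itself. A minimal standalone sketch of that pattern, assuming newspaper3k (where set_html() alone marks the article as downloaded, so the explicit is_downloaded = True assignments above, needed by older library versions, can be dropped); the HTML and URL are made up for illustration:

from newspaper import Article

html = """
<html>
  <head><title>Sample headline</title></head>
  <body>
    <article>
      <p>First paragraph of the story.</p>
      <p>Second paragraph with a bit more detail.</p>
    </article>
  </body>
</html>
"""

# Build the Article around pre-fetched HTML instead of calling download().
article = Article(url="http://example.com/sample", language="en")
article.set_html(html)
article.parse()

print(article.title)         # headline taken from the page
print(article.text)          # cleaned body text
print(article.top_image)     # best-guess lead image URL ("" if none found)
print(article.publish_date)  # datetime, or None if no date was detected
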
Example #2
File: page.py  Project: tfgg/ppsay
    def wrap_newspaper(self, web_page):
        parser = NewspaperArticle(url=web_page.final_url)
        parser.html = web_page.html
        parser.is_downloaded = True
        parser.parse()

        return parser
Example #4
from newspaper import Article
from newspaper.configuration import Configuration


def clean(html_content):
    config = Configuration()
    config.fetch_images = False

    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True
    article.parse()

    return article.text
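
A quick usage sketch for the clean() helper above; the HTML string is made up for illustration, and it assumes a newspaper version that accepts the is_downloaded assignment inside clean():

raw = """
<html>
  <body>
    <div class="nav">Home | World | Sports</div>
    <article>
      <p>The committee approved the proposal on Tuesday.</p>
      <p>A final vote is expected next month.</p>
    </article>
  </body>
</html>
"""

text = clean(raw)
print(text)  # extracted body text; boilerplate such as the nav bar is usually dropped
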
Example #6
import bz2

from pymongo import MongoClient
from newspaper import Article

client = MongoClient()

db_articles = client.news.articles
db_web_cache = client.news.web_cache

docs = db_articles.find()

for doc in docs:
    print doc['_id']

    if not doc['page']:
        continue

    url = doc['page']['urls'][0]
    web_cache_doc = db_web_cache.find_one({'url': url})

    if web_cache_doc and 'html_compressed' in web_cache_doc:
        article = Article(url=url)
        article.html = bz2.decompress(web_cache_doc['html_compressed'])
        article.is_downloaded = True
        article.parse()

        doc['page']['text'] = article.text
        print len(doc['page']['text'])

        db_articles.save(doc)
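
For context, a sketch of how a web_cache entry of the shape read above might be written in the first place. The field names (url, html_compressed) come from the loop above; the fetch with requests and the use of insert_one are assumptions for illustration:

import bz2

import requests
from pymongo import MongoClient

client = MongoClient()
db_web_cache = client.news.web_cache

url = "http://example.com/some-article"
resp = requests.get(url, timeout=10)

# Store the raw page bz2-compressed, keyed by URL, in the shape the
# re-parsing loop above expects.
db_web_cache.insert_one({
    "url": url,
    "html_compressed": bz2.compress(resp.content),
})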
Example #7
def parse(url=None, html=None, text=None, title=None,
          sentences_count=5,
          options={},
          summarize_algo="luhn",
          date_timezone="America/New_York"):
    """
    Parse article to get relevant data

    :param url: URL of the article to fetch when html/text are not provided
    :param html: raw HTML to parse instead of fetching the URL
    :param text: pre-extracted article text, used together with title
    :param title: article title, used together with text
    :param sentences_count: number of sentences to keep in the summary
    :param options: optional CSS selectors: title_selector, image_selector, content_selector
    :param summarize_algo: summarization algorithm to use (default "luhn")
    :param date_timezone: the timezone to convert the publish date to
    :return: dict with url, title, summary, text, images, videos, authors, published_date, ...
    """

    article = Article("")

    if text and title:
        article.is_parsed = True
        article.is_downloaded = True
        article.set_title(title)
        article.set_text(text)
    else:
        if url:
            r = requests.get(url.strip())
            if r.status_code != 200:
                raise Exception("Paper request failed '%s'" % url)
            html = r.content

        if html:
            soup = get_soup(html)
        else:
            raise Exception("Paper missing HTML content")

        article.set_html(remove_social_embeds(html))
        article.parse()
        article.nlp()

        if options.get("title_selector"):
            title = soup.select(options.get("title_selector"))
            if title:
                title = title[0].text
                article.set_title(title)

        if options.get("image_selector"):
            img = soup.select(options.get("image_selector"))
            if img:
                img = img[0].text
                article.set_top_img_no_check(img)

        if options.get("content_selector"):
            html = soup.select(options.get("content_selector"))
            if html:
                article.set_text(html[0].text)

    summary = summarize(text=article.text,
                        title=article.title,
                        algo=summarize_algo,
                        sentences_count=sentences_count)
    publish_date = article.publish_date
    if not publish_date and html:
        publish_date = extract_publish_date(html)
    if not publish_date:
        publish_date = datetime.datetime.now()

    return {
        "url": article.canonical_link,
        "title": article.title,
        "summary": summary,
        "summaries": summary.split("\n\n"),
        "text": article.text,
        "html": article.html,
        "top_image": article.top_image,
        "images": article.images,
        "videos": list(set(article.movies + extract_video_iframes(html))),
        "social_media_content": extract_social_media_content(html),
        "keywords": article.keywords,
        "tags": article.tags,
        "authors": article.authors,
        "published_date": datetime_to_local_timezone(publish_date),
        "md_text": ""
    }
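
A hedged usage sketch for parse() above; it assumes the helper functions the code relies on (get_soup, summarize, extract_publish_date, and so on) are available in the same module, and the URL and content selector are placeholders:

# Assuming parse() and its helpers are importable/defined in the current scope.
result = parse(
    url="https://example.com/2024/some-story.html",    # placeholder URL
    sentences_count=3,
    options={"content_selector": "div.article-body"},  # hypothetical selector
)

print(result["title"])
print(result["summary"])
print(result["published_date"])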