def parse_content(self, response):
    """Extract news content with newspaper."""
    item = response.meta['item']
    is_special, content = self._handle_special_site(response)
    if not is_special:  # not one of the specially handled sites
        article = Article(item['url'], language='zh')
        article.set_html(response.body)
        article.is_downloaded = True
        article.parse()
        item['pic'] = article.top_image
        item['content'] = str(article.text)
        publish_date = article.publish_date
        if publish_date:
            item['publish_date'] = publish_date.strftime("%Y-%m-%d %H:%M:%S")
        else:
            item['publish_date'] = "null"
    else:
        item['pic'] = ""
        item['content'] = content

    # content extraction failed
    if item['content'] == '':
        logging.error("empty content in: " + response.url)
        # raw_content = response.xpath("//body//p/text()").extract()
        # item['content'] = ''.join(raw_content)
        yield item
        return

    # strip non-breaking/ideographic spaces and common Chinese page boilerplate
    # (e.g. "scan the QR code with WeChat to share with friends and Moments",
    # "your current location:", "tip: tap above")
    item['content'] = item['content'].strip() \
        .replace(u"\xa0", "").replace(u"\u3000", "").replace("|", "") \
        .replace("用微信扫码二维码分享至好友和朋友圈", "") \
        .strip("您当前的位置 :").strip("您所在的位置:").strip("提示:点击上方").strip(">").strip()
    yield item
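A minimal sketch of how such a callback is typically wired up in a Scrapy spider. The spider name, start logic, and the bare-dict stand-in for the project's item class are assumptions, not taken from the original project:

# Hypothetical wiring for the parse_content callback above; names are illustrative.
import scrapy

class NewsSpider(scrapy.Spider):
    name = "news"  # assumed spider name

    def parse(self, response):
        item = {'url': response.url}  # stand-in for the project's item class
        # hand the partially filled item to parse_content via request meta
        yield scrapy.Request(response.url, callback=self.parse_content,
                             meta={'item': item}, dont_filter=True)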
def wrap_newspaper(self, web_page):
    parser = NewspaperArticle(url=web_page.final_url)
    parser.html = web_page.html
    parser.is_downloaded = True
    parser.parse()
    return parser
from newspaper import Article
from newspaper.configuration import Configuration

def clean(html_content):
    config = Configuration()
    config.fetch_images = False
    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True
    article.parse()
    return article.text
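A quick usage sketch for the helper above; the inline HTML is a placeholder input:

raw = "<html><body><nav>menu</nav><p>Only this paragraph should survive.</p></body></html>"
print(clean(raw))  # newspaper strips the page chrome and returns the article text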
import bz2

from pymongo import MongoClient
from newspaper import Article

client = MongoClient()
db_articles = client.news.articles
db_web_cache = client.news.web_cache

docs = db_articles.find()
for doc in docs:
    print(doc['_id'])
    if not doc['page']:
        continue
    url = doc['page']['urls'][0]
    web_cache_doc = db_web_cache.find_one({'url': url})
    if web_cache_doc and 'html_compressed' in web_cache_doc:
        # re-parse the cached, bz2-compressed HTML instead of re-fetching it
        article = Article(url=url)
        article.html = bz2.decompress(web_cache_doc['html_compressed'])
        article.is_downloaded = True
        article.parse()
        doc['page']['text'] = article.text
        print(len(doc['page']['text']))
        db_articles.save(doc)
import datetime

import requests
from newspaper import Article

# get_soup, summarize, remove_social_embeds, extract_publish_date,
# extract_video_iframes, extract_social_media_content and
# datetime_to_local_timezone are assumed to be helpers defined
# elsewhere in this module.


def parse(url=None, html=None, text=None, title=None, sentences_count=5,
          options=None, summarize_algo="luhn", date_timezone="America/New_York"):
    """
    Parse article to get relevant data

    :param url:
    :param html:
    :param text:
    :param title:
    :param sentences_count:
    :param options: {}
    :param summarize_algo:
    :param date_timezone: The timezone to convert the date to
    :return:
    """
    options = options or {}  # avoid a shared mutable default argument
    article = Article("")
    if text and title:
        article.is_parsed = True
        article.is_downloaded = True
        article.set_title(title)
        article.set_text(text)
    else:
        if url:
            r = requests.get(url.strip())
            if r.status_code != 200:
                raise Exception("Paper request failed '%s'" % url)
            html = r.content
        if html:
            soup = get_soup(html)
        else:
            raise Exception("Paper missing HTML content")
        article.set_html(remove_social_embeds(html))
        article.parse()
        article.nlp()
        if options.get("title_selector"):
            title = soup.select(options.get("title_selector"))
            if title:
                article.set_title(title[0].text)
        if options.get("image_selector"):
            img = soup.select(options.get("image_selector"))
            if img:
                article.set_top_img_no_check(img[0].text)
        if options.get("content_selector"):
            # use a separate name so the raw `html` string stays intact
            # for the video/social-media extraction below
            content = soup.select(options.get("content_selector"))
            if content:
                article.set_text(content[0].text)

    summary = summarize(text=article.text, title=article.title,
                        algo=summarize_algo, sentences_count=sentences_count)

    publish_date = article.publish_date
    if not publish_date and html:
        publish_date = extract_publish_date(html)
    if not publish_date:
        publish_date = datetime.datetime.now()

    return {
        "url": article.canonical_link,
        "title": article.title,
        "summary": summary,
        "summaries": summary.split("\n\n"),
        "text": article.text,
        "html": article.html,
        "top_image": article.top_image,
        "images": article.images,
        "videos": list(set(article.movies + extract_video_iframes(html))),
        "social_media_content": extract_social_media_content(html),
        "keywords": article.keywords,
        "tags": article.tags,
        "authors": article.authors,
        "published_date": datetime_to_local_timezone(publish_date),
        "md_text": ""
    }
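A hedged usage sketch for parse(); the URL and CSS selector below are placeholders, while the option keys (title_selector, image_selector, content_selector) come from the function itself:

result = parse(
    url="http://example.com/some-story",               # placeholder URL
    sentences_count=3,
    options={"content_selector": "div.article-body"},  # selector is illustrative
    summarize_algo="luhn",
)
print(result["title"])
print(result["summary"])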