def fetch(payload, dbi=None):
    """Fetch the document at payload['url'].

    Writes the URL that was finally read to payload['url_read'] and the
    response body to payload['src'].
    """
    import re
    from lxml.html import fromstring
    from lib import db, DB, logger
    from lib.util.text import to_unicode

    extra = {'classname': 'util.net.fetch()'}

    try:
        # urlopen, HTTP_TIMEOUT, get_portal and break_portal are expected to be
        # defined at module level in util.net
        uo = urlopen(payload['url'], timeout=HTTP_TIMEOUT)
        if uo.code != 200:
            raise IOError("HTTP response code=%d from %s" % (uo.code, uo.url))

        portal = get_portal(uo.url)
        if portal:
            break_portal(portal, payload, uo)
        else:
            payload['src'] = uo.read()
            payload['url_read'] = uo.url
    except Exception as e:
        # fetch failed; keep the error so save_fetch can record it below
        payload['src'] = 'error ' + unicode(e)
        payload['category'] = 'error'
        payload['exception'] = e

    if 'url_read' not in payload:
        payload['url_read'] = payload['url']

    if dbi is None:
        _dbi = DB()
    else:
        _dbi = dbi

    try:
        db.save_fetch(payload['url'], to_unicode(payload['src']),
                      payload['category'], dbi=_dbi)
    except Exception as e:
        logger.warning('DB save_fetch failed for url %s' % payload['url'], extra=extra)
        logger.debug(e)

    if dbi is None:
        _dbi.disconnect()

    if 'error' == payload['category']:
        # raise the exception so the caller skips the parsing step
        logger.info("failed fetching %s" % payload['url'], extra=extra)
        raise payload['exception']

    return payload
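# Illustrative usage sketch, not part of the original module. It shows how a
# caller might invoke fetch(). The example URL and the 'category' value are
# assumptions; save_fetch() reads payload['category'], so the caller is assumed
# to set it before fetching.
if __name__ == '__main__':
    payload = {
        'url': 'http://example.com/article/123',  # hypothetical URL
        'category': 'news',                       # assumed to be set by the caller
    }
    try:
        result = fetch(payload)
        print result['url_read']   # final URL after redirects / portal handling
        print len(result['src'])   # raw response body
    except Exception as e:
        # fetch() re-raises the original exception after logging, so the
        # dispatcher can skip parsing for this payload
        print 'fetch failed:', e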
def dispatch_response(self, payload, pool, dbi):
    """Process data returned by the fetcher: call parse_response to parse the
    content and store the result.

    Input payload format:
    {
        'src': 'RESPONSE_BODY',
        'meta': {
            'feed_url': '',
            'pub_date': 'str'
        }
    }
    Output: {'html': lxml tree}
    @endpoint
    """
    import lxml.html
    from lib import logger, util, db
    from lib.util.dt import to_timestamp
    from lib.util.text import to_unicode

    if not payload:
        pool.log_stats('error_fetch')
        return

    try:
        payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
    except KeyError:
        pass

    # DOM tree pre-processing
    try:
        html = lxml.html.fromstring(payload['src'])  # lxml handles html encoding
        payload['src'] = to_unicode(payload['src'])  # convert to unicode before storing
    except Exception:
        extra = {'classname': self.__class__}
        logger.warning("HTML parse error, url: %s", payload['url_read'], extra=extra)
        logger.info("Got: %s", payload['src'], extra=extra)
        pool.log_stats('error_parse')
        return

    # canonical url
    url_canonical = html.cssselect('link[rel=canonical]')
    payload['url_canonical'] = url_canonical[0].attrib['href'] \
        if len(url_canonical) > 0 else payload['url_read']

    # drop the charset declaration: the stored source is guaranteed to be
    # unicode, and keeping it could mislead the html parser
    tags = html.cssselect('meta[http-equiv=Content-Type]')
    if len(tags) > 0:
        payload['meta']['Content-Type'] = tags[0].attrib['content']
        for x in tags:
            x.drop_tree()

    payload['html'] = html

    self.move_out_of_meta(payload, 'feed_url')

    article = self.parse_response(payload)
    if article:
        # parsed successfully
        self._decorate_article(article)
        db.save_article(article, dbi=dbi)
        pool.log_stats('done_article')
    else:
        # TODO: still write to the article table
        db.save_response(payload, dbi=dbi)
        pool.log_stats('error_parse')
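# Minimal standalone sketch (assumes lxml with cssselect support is installed)
# of the two lxml steps dispatch_response() relies on: reading the canonical
# <link> and dropping the <meta http-equiv=Content-Type> tag once the source
# is unicode. The sample markup below is invented for illustration.
import lxml.html

src = (u'<html><head>'
       u'<link rel="canonical" href="http://example.com/a/1"/>'
       u'<meta http-equiv="Content-Type" content="text/html; charset=big5"/>'
       u'</head><body><p>hello</p></body></html>')

html = lxml.html.fromstring(src)

links = html.cssselect('link[rel=canonical]')
url_canonical = links[0].attrib['href'] if links else None
print url_canonical  # http://example.com/a/1

for tag in html.cssselect('meta[http-equiv=Content-Type]'):
    tag.drop_tree()   # charset no longer meaningful once the tree is unicode
print len(html.cssselect('meta'))  # 0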