def feed_revisit(pool, dbi=None):
    """Re-download news articles that are due for a revisit.

    Mimics the meta built by Base Ctlr :: dispatch_rss_2_0 and hands the
    request off to dispatch_response for processing.

    @see db.list_revisits()
    @startpoint
    """
    import json
    import importlib

    from lib import db, DB, logger
    from lib.util.dt import to_timestamp

    if dbi is None:
        _dbi = DB()
    else:
        _dbi = dbi

    ctlr_cache = {}

    # column indices of rows returned by db.list_recent_fetches()
    i_created_on = 0
    i_last_seen_on = 1
    i_pub_ts = 2
    i_feed_url = 3
    i_canonical_url = 4
    i_title = 5
    i_meta = 6
    i_ctlr = 7

    # logger.info("Found %d articles to revisit" % len(revisit_list))

    for x in db.list_recent_fetches(revisit_max_m(), dbi=_dbi):
        expired = need_revisit(x[i_created_on], x[i_last_seen_on])
        if not expired:
            continue

        # lazily import and instantiate the controller class named in the row
        if x[i_ctlr] not in ctlr_cache:
            (ns, cn) = x[i_ctlr].rsplit('.', 1)
            module = importlib.import_module(ns)
            ctlr_cache[x[i_ctlr]] = getattr(module, cn)()
        ctlr = ctlr_cache[x[i_ctlr]]

        # rebuild the meta dict the way dispatch_rss_2_0 would have
        meta = json.loads(x[i_meta])
        meta['feed_url'] = x[i_feed_url]
        meta['pub_date'] = to_timestamp(x[i_pub_ts])
        meta['title'] = x[i_title]

        logger.info('Revisiting %s, expired for %d min',
                    x[i_canonical_url], expired,
                    extra={'classname': feed_revisit})

        pool.log_stats('with_revisit')
        pool.put("http://" + x[i_canonical_url], ctlr.dispatch_response,
                 category="revisit", meta=meta)

    if dbi is None:
        _dbi.disconnect()
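# revisit_max_m() and need_revisit() are called by feed_revisit() above but are
# not defined in this snippet. The sketch below only illustrates the contract
# they appear to fulfil (a lookback window in minutes, and a per-row check that
# returns how many minutes the row has gone unfetched when a revisit is due);
# the window sizes are placeholder assumptions, not the project's values.

def revisit_max_m():
    """Assumed helper: lookback window, in minutes, for db.list_recent_fetches()."""
    return 2 * 24 * 60  # placeholder: consider articles fetched in the last 2 days


def need_revisit(created_on, last_seen_on):
    """Assumed helper: return minutes since the last fetch when a revisit is due,
    otherwise 0 (falsy). Sketch only; assumes datetime inputs."""
    from datetime import datetime

    now = datetime.now()
    idle_m = (now - last_seen_on).total_seconds() / 60
    age_m = (now - created_on).total_seconds() / 60
    # placeholder policy: revisit after at least 30 idle minutes, backing off
    # as the article gets older
    return idle_m if idle_m > max(30, age_m / 10) else 0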
def dispatch_response(self, payload, pool, dbi):
    """Handle data returned by the fetcher: call parse_response to parse the
    content and store the result.

    Input payload format:
        {'src': 'RESPONSE_BODY', 'meta': {'feed_url': '', 'pub_date': 'str'}}
    Output:
        {'html': lxml tree}

    @endpoint
    """
    import lxml.html

    from lib import logger, util, db
    from lib.util.dt import to_timestamp
    from lib.util.text import to_unicode

    if not payload:
        pool.log_stats('error_fetch')
        return

    try:
        payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
    except KeyError:
        pass

    # dom tree pre-processing
    try:
        # lxml handles html encoding
        html = lxml.html.fromstring(payload['src'])
        # convert to unicode before storing
        payload['src'] = to_unicode(payload['src'])
    except Exception:
        extra = {'classname': self.__class__}
        logger.warning("HTML parse error, url: %s", payload['url_read'], extra=extra)
        logger.info("Got: %s", payload['src'], extra=extra)
        pool.log_stats('error_parse')
        return

    # canonical url
    url_canonical = html.cssselect('link[rel=canonical]')
    payload['url_canonical'] = url_canonical[0].attrib['href'] \
        if len(url_canonical) > 0 else payload['url_read']

    # remove the charset declaration: the content is guaranteed to be unicode,
    # and leaving it in could mislead the html parser
    tags = html.cssselect('meta[http-equiv=Content-Type]')
    if len(tags) > 0:
        payload['meta']['Content-Type'] = tags[0].attrib['content']
        for x in tags:
            x.drop_tree()

    payload['html'] = html
    self.move_out_of_meta(payload, 'feed_url')

    article = self.parse_response(payload)

    if article:
        # parsed successfully
        self._decorate_article(article)
        db.save_article(article, dbi=dbi)
        pool.log_stats('done_article')
    else:
        # TODO: still write to the article table
        db.save_response(payload, dbi=dbi)
        pool.log_stats('error_parse')
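# move_out_of_meta() is called by dispatch_response() above but is not shown in
# this snippet. The method below is only a sketch of its assumed behaviour
# (promote a key from payload['meta'] to the payload top level), and the
# commented payload literal merely illustrates the documented input format with
# placeholder values; neither is the project's actual implementation.

def move_out_of_meta(self, payload, key):
    """Assumed helper: move payload['meta'][key] to payload[key] if present."""
    if key in payload.get('meta', {}):
        payload[key] = payload['meta'].pop(key)

# Example of the documented input payload (placeholder values only):
#
#   payload = {
#       'src': '<html>...</html>',                  # raw response body
#       'url_read': 'http://example.com/news/1',    # url actually fetched
#       'meta': {
#           'feed_url': 'http://example.com/rss',
#           'pub_date': '2013-01-01 12:00:00',
#           'title': 'Example headline',
#       },
#   }
#   ctlr.dispatch_response(payload, pool, dbi)      # ctlr/pool/dbi are placeholders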