Exemplo n.º 1
0
def fetch(payload, dbi=None):
    """Fetch the document at payload['url'] and record the attempt.

    On a plain success the response body is stored in payload['src'] and
    the URL reported by the response object in payload['url_read'].  When
    ``get_portal`` returns a truthy value for that URL the response is
    handed to ``break_portal`` instead (presumably it fills in the payload
    itself -- confirm against its definition).

    On failure payload['src'] holds an ``'error ...'`` message,
    payload['category'] is set to ``'error'`` and the caught exception is
    kept in payload['exception'].  Either way the outcome is passed to
    ``db.save_fetch``; a failure of that call is only logged.

    Args:
        payload: dict with at least 'url'; 'category' is read when saving.
        dbi: optional DB connection to reuse.  When None, a connection is
            created for this call and disconnected before returning.

    Returns:
        The same ``payload`` dict, updated in place.

    Raises:
        Exception: the exception caught while fetching is re-raised after
            it has been recorded, so callers skip the parsing step.
    """
    from lib import db, DB, logger
    from lib.util.text import to_unicode

    extra = {'classname': 'util.net.fetch()'}

    uo = None
    try:
        uo = urlopen(payload['url'], timeout=HTTP_TIMEOUT)
        if uo.code != 200:
            raise IOError("HTTP response code=%d from %s" % (uo.code, uo.url))

        portal = get_portal(uo.url)
        if portal:
            break_portal(portal, payload, uo)
        else:
            payload['src'] = uo.read()
            payload['url_read'] = uo.url
    except Exception as e:
        # Fetch failed; keep the details so save_fetch can record them.
        payload['src'] = 'error ' + unicode(e)
        payload['category'] = 'error'
        payload['exception'] = e
    finally:
        # Release the connection on every path.
        # NOTE(review): assumes break_portal does not keep `uo` for later
        # reads -- confirm.
        if uo is not None:
            uo.close()

    if 'url_read' not in payload:
        payload['url_read'] = payload['url']

    # Only a connection created here is ours to disconnect.
    owns_dbi = dbi is None
    _dbi = DB() if owns_dbi else dbi

    try:
        db.save_fetch(payload['url'],
                      to_unicode(payload['src']),
                      payload['category'],
                      dbi=_dbi)
    except Exception as e:
        # Recording the fetch is best-effort: log and carry on.
        logger.warning('DB save_fetch failed for url %s', payload['url'],
                       extra=extra)
        logger.debug(e, extra=extra)
    finally:
        if owns_dbi:
            _dbi.disconnect()

    if 'error' == payload['category']:
        # raise the exception to skip the parsing process
        logger.info("failed fetching %s", payload['url'], extra=extra)
        raise payload['exception']

    return payload
Exemplo n.º 2
0
def fetch(payload, dbi = None):
  """Fetch the document at payload['url'] and record the attempt.

  On a plain success the response body is stored in payload['src'] and
  the URL reported by the response object in payload['url_read'].  When
  ``get_portal`` returns a truthy value for that URL the response is
  handed to ``break_portal`` instead (presumably it fills in the payload
  itself -- confirm against its definition).

  On failure payload['src'] holds an ``'error ...'`` message,
  payload['category'] is set to ``'error'`` and the caught exception is
  kept in payload['exception'].  Either way the outcome is passed to
  ``db.save_fetch``; a failure of that call is only logged.

  Args:
    payload: dict with at least 'url'; 'category' is read when saving.
    dbi: optional DB connection to reuse.  When None, a connection is
      created for this call and disconnected before returning.

  Returns:
    The same ``payload`` dict, updated in place.

  Raises:
    Exception: the exception caught while fetching is re-raised after it
      has been recorded, so callers skip the parsing step.
  """
  from lib import db, DB, logger
  from lib.util.text import to_unicode

  extra = {'classname': 'util.net.fetch()'}

  uo = None
  try:
    uo = urllib.urlopen(payload['url'])
    if uo.code != 200:
      raise IOError("HTTP response code=%d from %s" % (uo.code, uo.url))

    portal = get_portal(uo.url)
    if portal:
      break_portal(portal, payload, uo)
    else:
      payload['src'] = uo.read()
      payload['url_read'] = uo.url
  except Exception as e:
    # Fetch failed; keep the details so save_fetch can record them.
    payload['src'] = 'error ' + unicode(e)
    payload['category'] = 'error'
    payload['exception'] = e
  finally:
    # Release the connection on every path.
    # NOTE(review): assumes break_portal does not keep `uo` for later
    # reads -- confirm.
    if uo is not None:
      uo.close()

  if 'url_read' not in payload:
    payload['url_read'] = payload['url']

  # Only a connection created here is ours to disconnect.
  owns_dbi = dbi is None
  _dbi = DB() if owns_dbi else dbi

  try:
    db.save_fetch(payload['url'], to_unicode(payload['src']), payload['category'], dbi = _dbi)
  except Exception as e:
    # Recording the fetch is best-effort: log and carry on.
    logger.warning('DB save_fetch failed for url %s', payload['url'], extra=extra)
    logger.debug(e, extra=extra)
  finally:
    if owns_dbi:
      _dbi.disconnect()

  if 'error' == payload['category']:
    # raise the exception to skip the parsing process
    logger.warning("failed fetching %s", payload['url'], extra=extra)
    raise payload['exception']

  return payload
Exemplo n.º 3
0
    def dispatch_response(self, payload, pool, dbi):
        """Parse a fetched response and store the result.

        Handles the data returned by the fetcher: builds an lxml tree from
        payload['src'], calls ``parse_response`` on the payload and saves
        either the parsed article or, when parsing yields nothing, the raw
        response.

        Expected input payload::

            {
              'src': 'RESPONSE_BODY',
              'meta': {
                'feed_url': '',
                'pub_date': 'str'
              }
            }

        The payload is updated in place and gains, among other keys::

            {
              'html': lxml tree
            }

        Args:
            payload: dict as above; payload['url_read'] is also read.  A
                falsy payload is counted as a fetch error and ignored.
            pool: object whose ``log_stats`` is called with the outcome
                ('error_fetch', 'error_parse' or 'done_article').
            dbi: DB connection forwarded to ``db.save_article`` /
                ``db.save_response``.

        Returns:
            None.

        @endpoint
        """
        import lxml.html
        from lib import logger, db
        from lib.util.dt import to_timestamp
        from lib.util.text import to_unicode

        if not payload:
            pool.log_stats('error_fetch')
            return

        try:
            payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
        except KeyError:
            # No 'meta' or no 'pub_date': leave 'pub_ts' unset.
            pass

        # DOM tree preprocessing.
        try:
            # lxml handles the html encoding itself, so parse the raw source
            # first and only then convert it to unicode for storage.
            html = lxml.html.fromstring(payload['src'])
            payload['src'] = to_unicode(payload['src'])
        except Exception:
            # Not a bare ``except:`` so KeyboardInterrupt / SystemExit still
            # propagate.  ``.get`` keeps the handler itself from raising when
            # the key that caused the failure is the one being logged.
            extra = {'classname': self.__class__}
            logger.warning("HTML parse error, url: %s",
                           payload.get('url_read'),
                           extra=extra)
            logger.info("Got: %s", payload.get('src'), extra=extra)
            pool.log_stats('error_parse')
            return

        # Canonical url: fall back to the URL actually read when the page
        # has no <link rel=canonical> or that link carries no usable href.
        canonical_links = html.cssselect('link[rel=canonical]')
        canonical_href = (canonical_links[0].attrib.get('href')
                          if canonical_links else None)
        payload['url_canonical'] = canonical_href or payload['url_read']

        # Remove the charset declaration: the stored source is guaranteed to
        # be unicode, and leaving it in may mislead the html parser.
        tags = html.cssselect('meta[http-equiv=Content-Type]')
        if tags:
            content_type = tags[0].attrib.get('content')
            if content_type is not None:
                # 'meta' may be absent (tolerated above), so create it here.
                payload.setdefault('meta', {})['Content-Type'] = content_type
            for tag in tags:
                tag.drop_tree()

        payload['html'] = html

        self.move_out_of_meta(payload, 'feed_url')

        article = self.parse_response(payload)

        if article:
            # parsed successfully
            self._decorate_article(article)
            db.save_article(article, dbi=dbi)
            pool.log_stats('done_article')
        else:
            # TODO: still write this to the article table
            db.save_response(payload, dbi=dbi)
            pool.log_stats('error_parse')
Exemplo n.º 4
0
  def dispatch_response(self, payload, pool, dbi):
    """Parse a fetched response and store the result.

    Handles the data returned by the fetcher: builds an lxml tree from
    payload['src'], calls ``parse_response`` on the payload and saves
    either the parsed article or, when parsing yields nothing, the raw
    response.

    Expected input payload::

        {
          'src': 'RESPONSE_BODY',
          'meta': {
            'feed_url': '',
            'pub_date': 'str'
          }
        }

    The payload is updated in place and gains, among other keys::

        {
          'html': lxml tree
        }

    Args:
      payload: dict as above; payload['url_read'] is also read.  A falsy
        payload is counted as a fetch error and ignored.
      pool: object whose ``log_stats`` is called with the outcome
        ('error_fetch', 'error_parse' or 'done_article').
      dbi: DB connection forwarded to ``db.save_article`` /
        ``db.save_response``.

    Returns:
      None.

    @endpoint
    """
    import lxml.html
    from lib import logger, db
    from lib.util.dt import to_timestamp
    from lib.util.text import to_unicode

    # Nothing to dispatch: without this guard a None payload raises an
    # uncaught TypeError on the first subscript below.
    if not payload:
      pool.log_stats('error_fetch')
      return

    try:
      payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
    except KeyError:
      # No 'meta' or no 'pub_date': leave 'pub_ts' unset.
      pass

    # DOM tree preprocessing.
    try:
      # lxml handles the html encoding itself, so parse the raw source
      # first and only then convert it to unicode for storage.
      html = lxml.html.fromstring(payload['src'])
      payload['src'] = to_unicode(payload['src'])
    except Exception:
      # Not a bare ``except:`` so KeyboardInterrupt / SystemExit still
      # propagate.  ``.get`` keeps the handler itself from raising when the
      # key that caused the failure is the one being logged.
      extra = {'classname': self.__class__}
      logger.warning("HTML parse error, url: %s", payload.get('url_read'), extra=extra)
      logger.info("Got: %s", payload.get('src'), extra=extra)
      pool.log_stats('error_parse')
      return

    # Canonical url: fall back to the URL actually read when the page has
    # no <link rel=canonical> or that link carries no usable href.
    canonical_links = html.cssselect('link[rel=canonical]')
    canonical_href = (canonical_links[0].attrib.get('href')
                      if canonical_links else None)
    payload['url_canonical'] = canonical_href or payload['url_read']

    # Remove the charset declaration: the stored source is guaranteed to be
    # unicode, and leaving it in may mislead the html parser.
    tags = html.cssselect('meta[http-equiv=Content-Type]')
    if tags:
      content_type = tags[0].attrib.get('content')
      if content_type is not None:
        # 'meta' may be absent (tolerated above), so create it here.
        payload.setdefault('meta', {})['Content-Type'] = content_type
      for tag in tags:
        tag.drop_tree()

    payload['html'] = html

    self.move_out_of_meta(payload, 'feed_url')

    article = self.parse_response(payload)

    if article:
      # parsed successfully
      self._decorate_article(article)
      db.save_article(article, dbi = dbi)
      pool.log_stats('done_article')
    else:
      # TODO: still write this to the article table
      db.save_response(payload, dbi = dbi)
      pool.log_stats('error_parse')