Example #1
def raise_error(*args, **kwargs):
    # Always fails: an unhandled exception in a task handler makes the
    # push queue retry the task.
    from google.appengine.api import taskqueue
    raise taskqueue.TransientError()
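
A minimal sketch (not from the original project) of how such a handler gets
exercised: any unhandled exception raised inside a deferred task, including
TransientError, makes the request fail, so the push queue retries the task
with back-off. The queue name below is an assumption for illustration.

from google.appengine.ext import deferred

def enqueue_failing_task():
    # raise_error is the function above; '_queue' is hypothetical.
    deferred.defer(raise_error, _queue='default')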
Example #2
# NOTE: project-specific names (Item, Category, PAGE_TYPE, NoSKU,
# cookie_value, scrape_item, scrape_category, index_items) are defined
# elsewhere in the project this snippet comes from.
import logging
import os
from datetime import datetime

from google.appengine.api import taskqueue, urlfetch
from google.appengine.ext import deferred, ndb


def scrape_page(url_type, url, cookies):
    def set_removed(url):
        # Flag every Item and Category entity matching this URL as removed.
        queries = [Item.query(Item.url == url),
                   Category.query(Category.url == url)]
        keys = []
        for query in queries:
            keys += query.fetch(keys_only=True)

        now = datetime.utcnow()

        # Update each entity in its own transaction so the re-indexing
        # task can be enqueued transactionally alongside the put().
        @ndb.transactional
        def tx(key):
            ent = key.get()
            if not ent.removed:
                ent.removed = now
                ent.put()
                if isinstance(ent, Item):
                    deferred.defer(index_items,
                                   [ent.key],
                                   _transactional=True,
                                   _queue='indexing',
                                   _countdown=2)
                logging.warn("%r: flagged removed" % ent.key)

        for key in keys:
            tx(key)

    # App Engine sets this header when it retries a task; 0 on the first run.
    retries = int(os.getenv('HTTP_X_APPENGINE_TASKRETRYCOUNT', 0))
    headers = {}

    # Redirects are followed manually (follow_redirects=False below) so
    # each hop can be logged and the stale URL flagged as removed.
    while True:
        if cookies:
            headers['Cookie'] = cookie_value(cookies)

        rs = urlfetch.fetch(url,
                            headers=headers,
                            follow_redirects=False,
                            deadline=20)

        cookie = rs.headers.get('Set-Cookie')
        if cookie:
            cookies.load(cookie)

        content = rs.content.decode('utf-8')
        if url_type == PAGE_TYPE.ITEM:
            if rs.status_code == 200:
                try:
                    scrape_item(url, content)
                except NoSKU:
                    logging.warn("Item page scraping error", exc_info=True)
                    set_removed(url)
                break
            elif rs.status_code in (301, 302):
                redir = rs.headers['Location']
                logging.warn("Item redir (%d) %s -> %s"
                             % (rs.status_code, url, redir))
                set_removed(url)
                url = redir
            # A 404 is only trusted once the task has already been retried.
            elif rs.status_code == 404 and retries > 1:
                set_removed(url)
                break
            else:
                # Any other status: fail the task so the queue retries it.
                raise taskqueue.TransientError(
                          "%d for %s\nBody:\n%s\n\nHeaders:\n%r"
                          % (rs.status_code,
                             url,
                             content.encode('ascii', 'xmlcharrefreplace')[:2000],
                             rs.headers))

        elif url_type == PAGE_TYPE.CATEGORY:
            if rs.status_code == 200:
                scrape_category(url, content)
                break
            elif rs.status_code in (301, 302):
                redir = rs.headers['Location']
                logging.warn("Category redir (%d) %s -> %s"
                             % (rs.status_code, url, redir))
                set_removed(url)
                url = redir
            elif rs.status_code == 404 and retries > 1:
                set_removed(url)
                break
            else:
                raise taskqueue.TransientError(
                          "%d for %s\nBody:\n%s\n\nHeaders:\n%r"
                          % (rs.status_code,
                             url,
                             content.encode('ascii', 'xmlcharrefreplace')[:2000],
                             rs.headers))

        else:
            raise ValueError("Unknown URL type %r" % (url_type,))
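
A hedged usage sketch, assuming scrape_page runs as a deferred push-queue
task (the queue name and the SimpleCookie jar are illustrative, not from
the original project). Raising taskqueue.TransientError fails the request,
the queue retries it, and X-AppEngine-TaskRetryCount (read above through
os.getenv('HTTP_X_APPENGINE_TASKRETRYCOUNT')) grows with each attempt,
which is what lets the 404 branches wait for retries > 1.

import Cookie

from google.appengine.ext import deferred

def enqueue_scrape(url):
    deferred.defer(scrape_page,
                   PAGE_TYPE.ITEM,          # assumed page type
                   url,
                   Cookie.SimpleCookie(),   # cookies.load() expects a cookie jar
                   _queue='scrape')         # hypothetical queue name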