def raise_error(*args, **kwargs):
    """Unconditionally raise ``taskqueue.TransientError``.

    Any positional or keyword arguments are accepted and ignored, so the
    function can be dropped in wherever a callable with an arbitrary
    signature is expected.

    Raises:
        taskqueue.TransientError: always.
    """
    # Imported lazily, inside the function, exactly as the original did.
    from google.appengine.api.taskqueue import TransientError

    raise TransientError()
# Upper bound on manually followed redirects per scrape_page() call.  The
# fetch is issued with follow_redirects=False and redirects are followed by
# hand, so without a cap a redirect cycle would spin until the request
# deadline.  NOTE(review): 5 is chosen to match the limit URL Fetch applies
# when it follows redirects itself -- adjust if the site needs more hops.
_MAX_REDIRECTS = 5


def _flag_url_removed(url):
    """Stamp every Item/Category stored under ``url`` as removed.

    Keys of all ``Item`` and ``Category`` entities whose ``url`` property
    equals ``url`` are fetched, then each entity is updated in its own
    transaction: if ``removed`` is not yet set it is set to the current UTC
    time and saved, and for ``Item`` entities a re-index task is enqueued
    transactionally on the 'indexing' queue.
    """
    keys = []
    for query in (Item.query(Item.url == url),
                  Category.query(Category.url == url)):
        keys.extend(query.fetch(keys_only=True))
    now = datetime.utcnow()

    @ndb.transactional
    def tx(key):
        ent = key.get()
        # The entity may have been deleted since the keys-only query ran;
        # there is nothing to flag in that case.
        if ent is None or ent.removed:
            return
        ent.removed = now
        ent.put()
        if isinstance(ent, Item):
            deferred.defer(index_items, [ent.key], _transactional=True,
                           _queue='indexing', _countdown=2)
        logging.warning("%r: flagged removed", ent.key)

    for key in keys:
        tx(key)


def _fetch_failure_message(rs, url):
    """Build the diagnostic text used when a fetch result is unusable.

    The body is decoded leniently (invalid UTF-8 is replaced) so that a
    badly encoded error page cannot mask the real failure, then rendered as
    ASCII with XML character references and truncated to 2000 characters.
    """
    body = rs.content.decode('utf-8', 'replace')
    return "%d for %s\nBody:\n%s\n\nHeaders:\n%r" % (
        rs.status_code, url,
        body.encode('ascii', 'xmlcharrefreplace')[:2000], rs.headers)


def scrape_page(url_type, url, cookies):
    """Fetch ``url`` and hand its content to the matching scraper.

    Redirects (301/302) are followed manually: the redirecting URL is
    flagged as removed and the ``Location`` target is fetched next, up to
    ``_MAX_REDIRECTS`` hops.

    Args:
        url_type: ``PAGE_TYPE.ITEM`` or ``PAGE_TYPE.CATEGORY``; selects
            ``scrape_item`` or ``scrape_category`` for a 200 response.
        url: address to fetch.
        cookies: cookie container; when non-empty it is sent (via
            ``cookie_value``) as the ``Cookie`` header, and any
            ``Set-Cookie`` response header is loaded back into it.

    Raises:
        ValueError: ``url_type`` is neither ITEM nor CATEGORY (raised
            before any network traffic).
        taskqueue.TransientError: unexpected status code, a 404 while the
            task retry count is still <= 1, a redirect without a
            ``Location`` header, or too many redirects.
    """
    if url_type == PAGE_TYPE.ITEM:
        label = "Item"
    elif url_type == PAGE_TYPE.CATEGORY:
        label = "Category"
    else:
        raise ValueError("Unknown URL type %r" % (url_type,))

    retries = int(os.getenv('HTTP_X_APPENGINE_TASKRETRYCOUNT', 0))
    headers = {}
    # One initial fetch plus at most _MAX_REDIRECTS followed redirects.
    for _ in range(_MAX_REDIRECTS + 1):
        if cookies:
            headers['Cookie'] = cookie_value(cookies)
        rs = urlfetch.fetch(url, headers=headers, follow_redirects=False,
                            deadline=20)
        cookie = rs.headers.get('Set-Cookie')
        if cookie:
            cookies.load(cookie)

        status = rs.status_code
        if status == 200:
            # Only a successful page needs a strict decode; error bodies
            # are decoded leniently in _fetch_failure_message().
            content = rs.content.decode('utf-8')
            if url_type == PAGE_TYPE.ITEM:
                try:
                    scrape_item(url, content)
                except NoSKU:
                    logging.warning("Item page scraping error", exc_info=True)
                    _flag_url_removed(url)
            else:
                scrape_category(url, content)
            return

        if status in (301, 302):
            redir = rs.headers.get('Location')
            if redir:
                # NOTE(review): a relative Location value is passed to the
                # next fetch unchanged -- confirm the site always sends
                # absolute URLs.
                logging.warning("%s redir (%d) %s -> %s",
                                label, status, url, redir)
                _flag_url_removed(url)
                url = redir
                continue
            # A redirect without a target falls through to the error below.
        elif status == 404 and retries > 1:
            # A 404 counts as "removed" only once the task has been retried
            # more than once; earlier 404s raise so the task is retried.
            _flag_url_removed(url)
            return

        raise taskqueue.TransientError(_fetch_failure_message(rs, url))

    raise taskqueue.TransientError(
        "Too many redirects (more than %d); gave up at %s"
        % (_MAX_REDIRECTS, url))