def request(self):
    headers = prepare_headers(self.feed)
    # using google bot header to trick tumblr rss...
    headers['User-Agent'] = GOOGLE_BOT_UA
    return jarr_get(self.get_url(), timeout=conf.crawler.timeout,
                    user_agent=conf.crawler.user_agent, headers=headers)

def request(self):
    headers = prepare_headers(self.feed)
    # using google bot header to trick tumblr rss...
    headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; " \
                            "+http://www.google.com/bot.html)"
    return jarr_get(self.get_url(), timeout=conf.crawler.timeout,
                    user_agent=conf.crawler.user_agent, headers=headers)

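# GOOGLE_BOT_UA, used in the first request() variant above, is presumably
# just the Googlebot string spelled out inline in the second variant.
# A minimal sketch of that constant, assuming it lives at module level:
GOOGLE_BOT_UA = ("Mozilla/5.0 (compatible; Googlebot/2.1; "
                 "+http://www.google.com/bot.html)")
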
def try_get_icon_url(url, *splits):
    for split in splits:
        if split is None:
            continue
        rb_url = rebuild_url(url, split)
        response = None
        # if html in content-type, we assume it's a fancy 404 page
        try:
            response = jarr_get(rb_url, conf.crawler.timeout,
                                conf.crawler.user_agent)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '')
        except Exception:
            logger.exception('something went wrong while fetching %r', rb_url)
        else:
            if response.ok and 'html' not in content_type \
                    and response.content:
                return response.url
    return None

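# A hypothetical call site for try_get_icon_url: the first argument is a
# (possibly relative) icon URL, and each extra argument is presumably a
# urlsplit() result used as a base to resolve it, or None to be skipped.
# The values below are illustrative assumptions, not the project's code.
from urllib.parse import urlsplit

icon_href = '/favicon.ico'                       # hypothetical icon link
site_split = urlsplit('https://example.com/')    # hypothetical site base
feed_split = None                                # no feed link: skipped
icon_url = try_get_icon_url(icon_href, site_split, feed_split)
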
def request(self):
    return jarr_get(self.get_url(), timeout=conf.crawler.timeout,
                    user_agent=conf.crawler.user_agent,
                    headers=prepare_headers(self.feed))

def http_get(url):
    try:
        return jarr_get(url)
    except (ReadTimeout, TimeoutError):
        # retry with a Googlebot User-Agent when the plain request times out
        return jarr_get(url, user_agent=GOOGLE_BOT_UA)

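# jarr_get is the project's HTTP wrapper and its body is not shown in these
# snippets. A minimal sketch consistent with the call sites above (positional
# timeout/user_agent in try_get_icon_url, keywords elsewhere, no arguments at
# all in http_get), assuming it wraps requests.get and lets an explicit
# 'User-Agent' header win over the user_agent argument, which is what the
# Tumblr request() variants rely on:
import requests

def jarr_get(url, timeout=None, user_agent=None, headers=None):
    headers = dict(headers or {})
    # fall back to the crawler configuration when nothing is passed
    headers.setdefault('User-Agent', user_agent or conf.crawler.user_agent)
    return requests.get(url, timeout=timeout or conf.crawler.timeout,
                        headers=headers)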