def urlopen(self, url, timeout=30, params=None, headers=None, opener=None, multipart=False, show_error=True):
    """Open a URL with shared headers, per-host throttling and failure tracking."""
    url = urllib2.quote(ss(url), safe="%/:=&?~#+!$,;'@()*[]")

    if not headers: headers = {}
    if not params: params = {}

    # Fill in some headers
    parsed_url = urlparse(url)
    host = '%s%s' % (parsed_url.hostname, (':' + str(parsed_url.port) if parsed_url.port else ''))

    headers['Referer'] = headers.get('Referer', '%s://%s' % (parsed_url.scheme, host))
    headers['Host'] = headers.get('Host', host)
    headers['User-Agent'] = headers.get('User-Agent', self.user_agent)
    headers['Accept-encoding'] = headers.get('Accept-encoding', 'gzip')
    headers['Connection'] = headers.get('Connection', 'keep-alive')
    headers['Cache-Control'] = headers.get('Cache-Control', 'max-age=0')

    # Don't try hosts that are temporarily disabled because of too many failed requests
    if self.http_failed_disabled.get(host, 0) > 0:
        if self.http_failed_disabled[host] > (time.time() - 900):
            log.info2('Disabled calls to %s for 15 minutes because so many failed requests.', host)
            if not show_error:
                raise Exception('Disabled calls to %s for 15 minutes because so many failed requests' % host)
            else:
                return ''
        else:
            # The 15 minute window has passed, reset the counters
            del self.http_failed_request[host]
            del self.http_failed_disabled[host]

    self.wait(host)

    try:

        # Make sure a passed-in opener sends the correct headers
        if opener:
            opener.add_headers = headers

        if multipart:
            log.info('Opening multipart url: %s, params: %s', (url, [x for x in params.iterkeys()] if isinstance(params, dict) else 'with data'))
            request = urllib2.Request(url, params, headers)

            if opener:
                opener.add_handler(MultipartPostHandler())
            else:
                cookies = cookielib.CookieJar()
                opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies), MultipartPostHandler)

            response = opener.open(request, timeout=timeout)
        else:
            log.info('Opening url: %s, params: %s', (url, [x for x in params.iterkeys()] if isinstance(params, dict) else 'with data'))

            if isinstance(params, (str, unicode)) and len(params) > 0:
                data = params
            else:
                data = tryUrlencode(params) if len(params) > 0 else None

            request = urllib2.Request(url, data, headers)

            if opener:
                response = opener.open(request, timeout=timeout)
            else:
                response = urllib2.urlopen(request, timeout=timeout)

        # Unzip if needed
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
            f.close()
        else:
            data = response.read()
        response.close()

        self.http_failed_request[host] = 0
    except IOError:
        if show_error:
            log.error('Failed opening url in %s: %s %s', (self.getName(), url, traceback.format_exc(1)))

        # Save failed requests by host
        try:
            if not self.http_failed_request.get(host):
                self.http_failed_request[host] = 1
            else:
                self.http_failed_request[host] += 1

                # Disable temporarily after too many failures, but never for local hosts
                if self.http_failed_request[host] > 5 and not isLocalIP(host):
                    self.http_failed_disabled[host] = time.time()

        except:
            log.debug('Failed logging failed requests for %s: %s', (url, traceback.format_exc()))

        raise

    self.http_last_use[host] = time.time()

    return data
def urlopen(self, url, timeout=30, params=None, headers=None, opener=None, multipart=False, show_error=True):
    url = ss(url)

    if not headers: headers = {}
    if not params: params = {}

    # Fill in some headers
    headers['Referer'] = headers.get('Referer', urlparse(url).hostname)
    headers['Host'] = headers.get('Host', urlparse(url).hostname)
    headers['User-Agent'] = headers.get('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:10.0.2) Gecko/20100101 Firefox/10.0.2')
    headers['Accept-encoding'] = headers.get('Accept-encoding', 'gzip')

    host = urlparse(url).hostname

    # Don't try hosts that are temporarily disabled because of too many failed requests
    if self.http_failed_disabled.get(host, 0) > 0:
        if self.http_failed_disabled[host] > (time.time() - 900):
            log.info2('Disabled calls to %s for 15 minutes because so many failed requests.', host)
            if not show_error:
                raise Exception('Disabled calls to %s for 15 minutes because so many failed requests' % host)
            else:
                return ''
        else:
            # The 15 minute window has passed, reset the counters
            del self.http_failed_request[host]
            del self.http_failed_disabled[host]

    self.wait(host)

    try:
        if multipart:
            log.info('Opening multipart url: %s, params: %s', (url, [x for x in params.iterkeys()] if isinstance(params, dict) else 'with data'))
            request = urllib2.Request(url, params, headers)

            if opener:
                opener.add_handler(MultipartPostHandler())
            else:
                cookies = cookielib.CookieJar()
                opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies), MultipartPostHandler)

            response = opener.open(request, timeout=timeout)
        else:
            log.info('Opening url: %s, params: %s', (url, [x for x in params.iterkeys()]))
            data = tryUrlencode(params) if len(params) > 0 else None
            request = urllib2.Request(url, data, headers)

            if opener:
                response = opener.open(request, timeout=timeout)
            else:
                response = urllib2.urlopen(request, timeout=timeout)

        # Unzip if needed
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()
        else:
            data = response.read()

        self.http_failed_request[host] = 0
    except IOError:
        if show_error:
            log.error('Failed opening url in %s: %s %s', (self.getName(), url, traceback.format_exc(1)))

        # Save failed requests by host
        try:
            if not self.http_failed_request.get(host):
                self.http_failed_request[host] = 1
            else:
                self.http_failed_request[host] += 1

            # Disable temporarily after too many failures
            if self.http_failed_request[host] > 5:
                self.http_failed_disabled[host] = time.time()

        except:
            log.debug('Failed logging failed requests for %s: %s', (url, traceback.format_exc()))

        raise

    self.http_last_use[host] = time.time()

    return data
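# A minimal usage sketch, assuming urlopen() above lives on a plugin base class;
# the ExampleProvider class, its Plugin parent and the URL below are hypothetical
# and only illustrate the call signature, they are not part of the code above.
class ExampleProvider(Plugin):

    def search(self, query):
        # Dict params are urlencoded and sent as POST data; an empty dict results in a plain GET.
        # show_error=False keeps failures out of the error log, but IOErrors are still re-raised.
        return self.urlopen('http://example.com/api/search', params={'q': query}, timeout=10, show_error=False)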