def process_request_result(self, prepare_response_func=None):
    """
    Process result of real request performed via transport extension.

    Steps, in order: log the submitted POST data (only when the
    ``debug_post`` option is set), reset temporary options, build
    ``self.doc`` from the transport, copy cookies/charset/referer from the
    response into this object, write the optional log and cookie files,
    and finally follow a refresh URL found in the response body when the
    ``follow_refresh`` option is set.

    :param prepare_response_func: optional callable invoked as
        ``prepare_response_func(self.transport, self)`` to build the
        response document. When omitted,
        ``self.transport.prepare_response(self)`` is used.
    :returns: the result of the follow-up ``self.request(...)`` call when
        a refresh URL is followed, otherwise ``None``.
    :raises error.GrabTooManyRedirectsError: if following the refresh URL
        would push the refresh counter above ``redirect_limit``.
    """
    now = datetime.now()

    # TODO: move into separate method
    if self.config['debug_post']:
        post = self.config['post'] or self.config['multipart_post']
        if isinstance(post, dict):
            post = list(post.items())
        if post:
            limit = self.config['debug_post_limit']
            if isinstance(post, basestring):
                post = post[:limit] + '...'
            else:
                items = normalize_http_values(post, charset='utf-8')
                new_items = []
                for key, value in items:
                    # Keep the log readable: cut overly long values
                    if len(value) > limit:
                        value = value[:limit] + '...'
                    new_items.append((key, value))
                post = '\n'.join('%-25s: %s' % x for x in new_items)
        if post:
            logger_network.debug('[%02d] POST request:\n%s\n'
                                 % (self.request_counter, post))

    # It's important to delete old POST data after request is performed.
    # If POST data is not cleared then next request will try to use them
    # again!
    # The refresh counter is read before the reset because it is needed
    # below (presumably the reset clears it -- confirm in
    # `reset_temporary_options`).
    old_refresh_count = self.config['refresh_redirect_count']
    self.reset_temporary_options()

    if prepare_response_func:
        self.doc = prepare_response_func(self.transport, self)
    else:
        self.doc = self.transport.prepare_response(self)

    # Workaround: make sure the document can reach this object; a weak
    # proxy is used instead of a strong reference.
    if self.doc.grab is None:
        self.doc.grab = weakref.proxy(self)

    if self.config['reuse_cookies']:
        self.cookies.update(self.doc.cookies)

    self.doc.timestamp = now

    self.config['charset'] = self.doc.charset

    if self.config['log_file']:
        with open(self.config['log_file'], 'wb') as out:
            out.write(self.doc.body)

    if self.config['cookiefile']:
        self.cookies.save_to_file(self.config['cookiefile'])

    if self.config['reuse_referer']:
        self.config['referer'] = self.doc.url

    self.copy_request_data()

    # Should be called after `copy_request_data`
    self.save_dumps()

    if self.config['follow_refresh']:
        url = find_refresh_url(self.doc.unicode_body())
        if url is not None:
            inc_count = old_refresh_count + 1
            if inc_count > self.config['redirect_limit']:
                raise error.GrabTooManyRedirectsError()
            return self.request(url=url, refresh_redirect_count=inc_count)

    return None
def process_config(self, grab):
    """
    Setup curl instance with values from ``grab.config``.

    Translates the URL, redirect/timeout settings, body storage,
    User-Agent, request method and payload, headers, cookies, referer,
    proxy, encoding and auth options into ``self.curl.setopt`` calls.

    Side effect: when ``grab.config['user_agent']`` is ``None`` (or
    otherwise falsy) it is replaced in ``grab.config`` with the value
    actually sent.

    :param grab: object providing the ``config`` mapping and the
        ``request_method`` attribute read below.
    :raises error.GrabInvalidUrl: if ``normalize_url`` fails on the
        configured URL.
    :raises error.GrabMisuseError: on an unsupported request method, a
        payload of the wrong type, a missing ``body_storage_dir`` or gzip
        encoding requested without zlib support in pycurl.
    """
    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (unicode(ex), grab.config['url']))

    # py3 hack
    if not PY3K:
        request_url = smart_str(request_url)

    self.curl.setopt(pycurl.URL, request_url)
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT,
                     grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

    # The body always goes through `body_processor`; the body file only
    # has to be prepared first when the body is not kept in memory.
    if not grab.config['body_inmemory']:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
    self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''

    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    # NOTE(review): certificate and host verification are switched off
    # for every request; this is existing behavior, kept unchanged.
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # SSLVERSION is deliberately left at its default to avoid the
    # "SSL3_READ_BYTES:sslv3 alert handshake failure" error.

    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'], basestring):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'])
            # py3 hack
            if PY3K:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            self.curl.setopt(pycurl.HTTPPOST, post_items)
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            self.curl.setopt(pycurl.COPYPOSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, unicode) or (
                not PY3K and not isinstance(data, basestring)):
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, unicode) or not isinstance(data, basestring):
            # py3 hack: text is encoded, byte strings pass through as-is.
            # Anything else is a misuse (previously a non-string value
            # under PY3K crashed on `.encode`).
            if PY3K and isinstance(data, unicode):
                data = data.encode('utf-8')
            elif not (PY3K and isinstance(data, bytes)):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        # HTTP method names are case-sensitive and libcurl sends the
        # custom request string verbatim.
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)

    # Merge into a copy so that per-request headers do not leak into the
    # shared `common_headers` option.
    headers = grab.config['common_headers'].copy()
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        ptype = getattr(pycurl, 'PROXYTYPE_%s' %
                        grab.config['proxy_type'].upper())
        self.curl.setopt(pycurl.PROXYTYPE, ptype)

    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding']
                and 'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def process_config(self, grab):
    """
    Collect request options from ``grab.config`` into
    ``self.requests_config``.

    The resulting dict holds the keys ``url``, ``method``, ``headers``,
    ``payload``, ``cookies`` and ``proxy``.

    Side effects on ``grab.config``: a unicode ``url`` is replaced with
    its UTF-8 encoded form and a missing ``user_agent`` is replaced with
    the value actually sent.

    NOTE(review): options such as timeouts, redirects, referer, debug and
    SSL settings are not read by this method; the pycurl calls that used
    to be commented out here suggest they were never ported to this
    transport -- confirm before relying on them.

    :param grab: object providing the ``config`` mapping, the
        ``request_method`` attribute and ``load_cookies``.
    :raises NotImplementedError: if ``multipart_post`` is set for a POST
        or PUT request.
    :raises GrabMisuseError: if ``proxy_userpwd`` is set or ``proxy_type``
        is anything other than ``'http'``.
    """
    # Accumulate all request options into `self.requests_config`
    self.requests_config = {
        'headers': {},
        'payload': None,
        'cookies': None,
        'proxy': None,
    }

    if isinstance(grab.config['url'], unicode):
        grab.config['url'] = grab.config['url'].encode('utf-8')
    self.requests_config['url'] = grab.config['url']

    # User-Agent
    # TODO: move to base class
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            # Close the file deterministically instead of leaking the
            # handle until garbage collection.
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    # For consistency we send empty User-Agent in case of None value
    # in all other transports too
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''

    self.requests_config['headers']['User-Agent'] = \
        grab.config['user_agent']

    self.requests_config['method'] = grab.request_method.lower()

    # Only POST and PUT carry a payload; other methods need no extra setup
    if grab.request_method == 'POST' or grab.request_method == 'PUT':
        if grab.config['multipart_post']:
            raise NotImplementedError(
                'multipart_post is not supported by requests transport')
        elif grab.config['post']:
            if isinstance(grab.config['post'], basestring):
                # bytes-string should be posted as-is
                # unicode should be converted into byte-string
                if isinstance(grab.config['post'], unicode):
                    post_data = normalize_unicode(grab.config['post'])
                else:
                    post_data = grab.config['post']
            else:
                # dict, tuple, list should be serialized into byte-string
                post_data = urlencode(grab.config['post'])
            self.requests_config['payload'] = post_data

    # Merge into a copy so that per-request headers do not leak into the
    # shared `common_headers` option.
    headers = grab.config['common_headers'].copy()
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    self.requests_config['headers'].update(headers)

    # `cookiefile` option should be processed before `cookies` option
    # because `load_cookies` updates `cookies` option
    if grab.config['cookiefile']:
        grab.load_cookies(grab.config['cookiefile'])
    if grab.config['cookies']:
        items = normalize_http_values(grab.config['cookies'])
        self.requests_config['cookies'] = dict(items)

    if grab.config['proxy']:
        self.requests_config['proxy'] = grab.config['proxy']

    if grab.config['proxy_userpwd']:
        raise GrabMisuseError(
            'requests transport does not support proxy authentication')

    if grab.config['proxy_type']:
        if grab.config['proxy_type'] != 'http':
            raise GrabMisuseError(
                'requests transport supports only proxies of http type')
def process_config(self, grab):
    """
    Setup curl instance with values from ``grab.config``.

    Translates the URL, redirect/timeout settings, body storage,
    User-Agent, request method and payload, headers, cookies, referer,
    proxy, encoding and auth options into ``self.curl.setopt`` calls.

    Side effect: when ``grab.config['user_agent']`` is ``None`` (or
    otherwise falsy) it is replaced in ``grab.config`` with the value
    actually sent.

    :param grab: object providing the ``config`` mapping and the
        ``request_method`` attribute read below.
    :raises error.GrabInvalidUrl: if ``normalize_url`` fails on the
        configured URL.
    :raises error.GrabMisuseError: on an unsupported request method, a
        payload of the wrong type, a missing ``body_storage_dir`` or gzip
        encoding requested without zlib support in pycurl.
    """
    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (unicode(ex), grab.config['url']))

    # py3 hack
    if not PY3K:
        request_url = smart_str(request_url)

    self.curl.setopt(pycurl.URL, request_url)
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT,
                     grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

    # The body always goes through `body_processor`; the body file only
    # has to be prepared first when the body is not kept in memory.
    if not grab.config['body_inmemory']:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
    self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''

    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    # NOTE(review): certificate and host verification are switched off
    # for every request; this is existing behavior, kept unchanged.
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # SSLVERSION is deliberately left at its default to avoid the
    # "SSL3_READ_BYTES:sslv3 alert handshake failure" error.

    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'], basestring):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'])
            # py3 hack
            if PY3K:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            self.curl.setopt(pycurl.HTTPPOST, post_items)
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            self.curl.setopt(pycurl.COPYPOSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, unicode) or (
                not PY3K and not isinstance(data, basestring)):
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, unicode) or not isinstance(data, basestring):
            # py3 hack: text is encoded, byte strings pass through as-is.
            # Anything else is a misuse (previously a non-string value
            # under PY3K crashed on `.encode`).
            if PY3K and isinstance(data, unicode):
                data = data.encode('utf-8')
            elif not (PY3K and isinstance(data, bytes)):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        # HTTP method names are case-sensitive and libcurl sends the
        # custom request string verbatim.
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)

    # Merge into a copy so that per-request headers do not leak into the
    # shared `common_headers` option.
    headers = grab.config['common_headers'].copy()
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        ptype = getattr(pycurl, 'PROXYTYPE_%s' %
                        grab.config['proxy_type'].upper())
        self.curl.setopt(pycurl.PROXYTYPE, ptype)

    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding']
                and 'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def process_request_result(self, prepare_response_func=None):
    """
    Process result of real request performed via transport extension.

    Steps, in order: log the submitted POST data (only when the
    ``debug_post`` option is set), reset temporary options, build
    ``self.doc`` from the transport, copy cookies/charset/referer from the
    response into this object, write the optional log and cookie files,
    and finally follow a refresh URL found in the response body when the
    ``follow_refresh`` option is set.

    :param prepare_response_func: optional callable invoked as
        ``prepare_response_func(self.transport, self)`` to build the
        response document. When omitted,
        ``self.transport.prepare_response(self)`` is used.
    :returns: the result of the follow-up ``self.request(...)`` call when
        a refresh URL is followed, otherwise ``None``.
    :raises error.GrabTooManyRedirectsError: if following the refresh URL
        would push the refresh counter above ``redirect_limit``.
    """
    now = datetime.now()

    # TODO: move into separate method
    if self.config['debug_post']:
        post = self.config['post'] or self.config['multipart_post']
        if isinstance(post, dict):
            post = list(post.items())
        if post:
            limit = self.config['debug_post_limit']
            if isinstance(post, basestring):
                post = post[:limit] + '...'
            else:
                items = normalize_http_values(post, charset='utf-8')
                new_items = []
                for key, value in items:
                    # Keep the log readable: cut overly long values
                    if len(value) > limit:
                        value = value[:limit] + '...'
                    new_items.append((key, value))
                post = '\n'.join('%-25s: %s' % x for x in new_items)
        if post:
            logger_network.debug('[%02d] POST request:\n%s\n'
                                 % (self.request_counter, post))

    # It's important to delete old POST data after request is performed.
    # If POST data is not cleared then next request will try to use them
    # again!
    # The refresh counter is read before the reset because it is needed
    # below (presumably the reset clears it -- confirm in
    # `reset_temporary_options`).
    old_refresh_count = self.config['refresh_redirect_count']
    self.reset_temporary_options()

    if prepare_response_func:
        self.doc = prepare_response_func(self.transport, self)
    else:
        self.doc = self.transport.prepare_response(self)

    # Workaround: make sure the document can reach this object; a weak
    # proxy is used instead of a strong reference.
    if self.doc.grab is None:
        self.doc.grab = weakref.proxy(self)

    if self.config['reuse_cookies']:
        self.cookies.update(self.doc.cookies)

    self.doc.timestamp = now

    self.config['charset'] = self.doc.charset

    if self.config['log_file']:
        with open(self.config['log_file'], 'wb') as out:
            out.write(self.doc.body)

    if self.config['cookiefile']:
        self.cookies.save_to_file(self.config['cookiefile'])

    if self.config['reuse_referer']:
        self.config['referer'] = self.doc.url

    self.copy_request_data()

    # Should be called after `copy_request_data`
    self.save_dumps()

    if self.config['follow_refresh']:
        url = find_refresh_url(self.doc.unicode_body())
        if url is not None:
            inc_count = old_refresh_count + 1
            if inc_count > self.config['redirect_limit']:
                raise error.GrabTooManyRedirectsError()
            return self.request(url=url, refresh_redirect_count=inc_count)

    return None