def text_search(self, anchor, byte=False):
    """
    Search the substring in response body.

    :param anchor: string to search
    :param byte: if False then `anchor` should be the
        unicode string, and search will be performed in
        `response.unicode_body()` else `anchor` should be the byte-string
        and search will be performed in `response.body`

    If substring is found return True else False.

    :raises GrabMisuseError: if the type of `anchor` does not match
        the selected mode (unicode for non-byte mode, bytes for byte mode)
    """
    if isinstance(anchor, six.text_type):
        if byte:
            raise GrabMisuseError('The anchor should be bytes string in '
                                  'byte mode')
        return anchor in self.unicode_body()
    else:
        # anchor is a byte string here
        if byte:
            return anchor in self.body
        # BUGFIX: message used to claim the anchor "should be byte
        # string in non-byte mode" although on this path the anchor
        # already IS a byte string; in non-byte mode it must be unicode.
        raise GrabMisuseError('The anchor should be unicode string in '
                              'non-byte mode')
def xpath_number(self, path, default=NULL, filter=None, ignore_spaces=False,
                 smart=False, make_int=True):
    """Deprecated shortcut: extract a number via ``self.doc.select(path)``.

    The `filter` argument is kept for signature compatibility only and
    must be None.
    """
    if filter is not None:
        raise GrabMisuseError('Argument `filter` is not supported anymore')
    selection = self.doc.select(path)
    return selection.number(default=default, ignore_spaces=ignore_spaces,
                            smart=smart, make_int=make_int)
def create_cookie(name, value, domain, httponly=None, **kwargs):
    """Creates `cookielib.Cookie` instance.

    :param name: cookie name
    :param value: cookie value
    :param domain: cookie domain ('localhost' is normalized to '')
    :param httponly: value stored in the non-standard HttpOnly attribute
    :param kwargs: overrides for any default Cookie field
    :raises GrabMisuseError: if kwargs contains an unknown field name
    """
    if domain == 'localhost':
        domain = ''
    fields = {
        'name': name,
        'value': value,
        'version': 0,
        'port': None,
        'domain': domain,
        'path': '/',
        'secure': False,
        'expires': None,
        'discard': True,
        'comment': None,
        'comment_url': None,
        'rfc2109': False,
        'rest': {'HttpOnly': httponly},
    }
    for key in kwargs:
        if key not in fields:
            raise GrabMisuseError('Function `create_cookie` does not accept '
                                  '`%s` argument' % key)
    fields.update(kwargs)
    # re-assert HttpOnly in case kwargs replaced the whole `rest` dict
    fields['rest']['HttpOnly'] = httponly
    # derived flags required by the Cookie constructor
    fields['port_specified'] = bool(fields['port'])
    fields['domain_specified'] = bool(fields['domain'])
    fields['domain_initial_dot'] = (fields['domain'] or '').startswith('.')
    fields['path_specified'] = bool(fields['path'])
    return Cookie(**fields)
def __init__(self, source, source_type, proxy_type='http', **kwargs):
    """
    Create `ProxyList` object and load proxies from the specified source.

    You should specify type of source in second argument to let ProxyList
    instance know how to handle proxy source.

    :param source: source of the project (file name, string or some object)
    :param source_type: type of proxy source
    :param proxy_type: default type of proxy (if proxy source does not
        provide this information)
    :param **kwargs: any additional arguments goes to specific
        proxy load method
    :raises GrabMisuseError: if `source_type` is not a known source type
    """
    self.init_kwargs = deepcopy(kwargs)
    try:
        source_class = SOURCE_LIST[source_type]
    # BUGFIX: dict subscription raises KeyError, not AttributeError,
    # so the misuse error was never produced for unknown source types
    except KeyError:
        raise GrabMisuseError('Unknown proxy source type: %s'
                              % source_type)
    self.source = source_class(source, proxy_type=proxy_type, **kwargs)
    self.source.load()
    self.filter_config = {}
    self.geoip_resolver = None
def _build_selector(cls, tree, selector_type):
    """Wrap *tree* in the selector class matching *selector_type*.

    :raises GrabMisuseError: for unsupported selector types
    """
    selector_classes = {
        'xpath': XpathSelector,
        'json': JsonSelector,
    }
    selector_cls = selector_classes.get(selector_type)
    if selector_cls is None:
        raise GrabMisuseError('Unknown selector type: %s' % selector_type)
    return selector_cls(tree)
def choose_form(self, number=None, xpath=None, name=None, **kwargs):
    """
    Set the default form.

    Exactly one selection criterion must be given; they are checked
    in the order: id, name, number, xpath.

    :param number: number of form (starting from zero)
    :param id: value of "id" attribute (passed via kwargs)
    :param name: value of "name" attribute
    :param xpath: XPath query
    :raises: :class:`DataNotFound` if form not found
    :raises: :class:`GrabMisuseError` if method is called without parameters

    Selected form will be available via `form` attribute of `Grab`
    instance. All form methods will work with default form.

    Examples::

        # Select second form
        g.choose_form(1)

        # Select by id
        g.choose_form(id="register")

        # Select by name
        g.choose_form(name="signup")

        # Select by xpath
        g.choose_form(xpath='//form[contains(@action, "/submit")]')
    """
    form_id = kwargs.pop('id', None)
    if form_id is not None:
        try:
            self._lxml_form = self.select(
                '//form[@id="%s"]' % form_id).node()
        except IndexError:
            raise DataNotFound('There is no form with id: %s' % form_id)
        return
    if name is not None:
        try:
            self._lxml_form = self.select(
                '//form[@name="%s"]' % name).node()
        except IndexError:
            raise DataNotFound('There is no form with name: %s' % name)
        return
    if number is not None:
        try:
            self._lxml_form = self.tree.forms[number]
        except IndexError:
            raise DataNotFound('There is no form with number: %s' % number)
        return
    if xpath is not None:
        try:
            self._lxml_form = self.select(xpath).node()
        except IndexError:
            raise DataNotFound(
                'Could not find form with xpath: %s' % xpath)
        return
    raise GrabMisuseError('choose_form methods requires one of '
                          '[number, id, name, xpath] arguments')
def remove_bom(text):
    """
    Remove BOM-sequence from the start of byte string.

    :param text: byte string, possibly starting with a UTF-8 BOM
    :raises GrabMisuseError: if a unicode string is passed
    """
    # BUGFIX: the check used the bare `unicode` name, which is a
    # NameError on Python 3; six.text_type is the file's convention.
    if isinstance(text, six.text_type):
        raise GrabMisuseError('remove_bom function accepts only byte strings')
    if text.startswith(BOM_TOKEN):
        # BOM_TOKEN is the 3-byte UTF-8 BOM, hence the fixed slice
        return text[3:]
    return text
def _write_body(self, body):
    """Store the byte body either on disk (when `body_path` is set)
    or in memory; the cached unicode body is invalidated either way.

    :raises GrabMisuseError: if a unicode string is passed
    """
    if isinstance(body, six.text_type):
        raise GrabMisuseError('Document.body could be only byte string.')
    if self.body_path:
        with open(self.body_path, 'wb') as out:
            out.write(body)
        self._bytes_body = None
    else:
        self._bytes_body = body
    self._unicode_body = None
def update(self, cookies):
    """Merge cookies from a CookieJar or another CookieManager
    into this manager's cookiejar.

    :raises GrabMisuseError: for any other argument type
    """
    if isinstance(cookies, CookieJar):
        incoming = cookies
    elif isinstance(cookies, CookieManager):
        incoming = cookies.cookiejar
    else:
        raise GrabMisuseError('Unknown type of cookies argument: %s'
                              % type(cookies))
    for item in incoming:
        self.cookiejar.set_cookie(item)
def xpath_text(self, path, default=NULL, filter=None, smart=False,
               normalize_space=True):
    """Deprecated shortcut: extract text via ``self.doc.select(path)``.

    The `filter` argument is kept for signature compatibility only and
    must be None.
    """
    if filter is not None:
        raise GrabMisuseError('Argument `filter` is not supported anymore')
    selection = self.doc.select(path)
    return selection.text(default=default, smart=smart,
                          normalize_space=normalize_space)
def func_field(*args, **kwargs):
    """Decorator factory that builds a FuncField from the decorated
    function, forcing ``pass_item=True``.

    Guards against the common mistake of writing ``@func_field``
    instead of ``@func_field()``.

    :raises GrabMisuseError: if used without parentheses
    """
    # BUGFIX: `collections.Callable` was removed in Python 3.10
    # (moved to collections.abc); the builtin callable() is equivalent.
    if not kwargs and len(args) == 1 and callable(args[0]):
        raise GrabMisuseError('It seems that you forgot to "call" the '
                              'func_field decorator. Use "@func_field()" '
                              'instead "func_field".')

    def wrapper(func):
        kwargs['pass_item'] = True
        return FuncField(func=func, *args, **kwargs)
    return wrapper
def request(self):
    """Execute the prepared request via urllib3 and store the raw
    response object in ``self._response``.

    Maps urllib3 exceptions onto Grab's error hierarchy.

    :raises GrabMisuseError: for proxy schemes urllib3 does not support
    :raises error.GrabTimeoutError: on read timeout
    :raises error.GrabConnectionError: on connect timeout or protocol error
    """
    req = self._request
    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        try:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
        except ProxySchemeUnknown:
            raise GrabMisuseError('Urllib3 transport does '
                                  'not support %s proxies'
                                  % req.proxy_type)
    else:
        pool = self.pool
    try:
        # Redirects and retries are handled by Grab itself
        retry = Retry(redirect=False, connect=False, read=False)
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout,
                          read=req.timeout)
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        res = pool.urlopen(req_method,
                           req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    # BUGFIX: dropped unused `as ex` aliases on the first two handlers
    except exceptions.ReadTimeoutError:
        raise error.GrabTimeoutError('Read timeout')
    except exceptions.ConnectTimeoutError:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
    # NOTE(review): urllib3 does not appear to expose the raw request
    # head/body here, so these stay empty — confirm before relying on them
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''
    self._response = res
def load(self):
    """
    Load proxies from given list.

    Each proxy server could be in two forms:
    * simple: "server:port"
    * complex: "server:port:user:pwd"

    :raises GrabMisuseError: if the source is not a python list
    """
    if not isinstance(self.source, list):
        raise GrabMisuseError("Given proxy list isn't a list type")
    servers = self.get_server_list(self.source)
    self.server_list = servers
    self.server_list_iterator = itertools.cycle(servers)
def load(self):
    """
    Load proxies from given string.

    String can be multiline.
    Each proxy server could be in two forms:
    * simple: "server:port"
    * complex: "server:port:user:pwd"

    :raises GrabMisuseError: if the source is not a string
    """
    # BUGFIX: the check used the bare `unicode` name, which is a
    # NameError on Python 3; six.string_types covers str/unicode on
    # Python 2 and str on Python 3.
    if not isinstance(self.source, six.string_types):
        raise GrabMisuseError("Given proxy list isn't a string or unicode type")
    self.server_list = self.get_server_list(self.source)
    self.server_list_iterator = itertools.cycle(self.server_list)
def extract_document_data(cls, grab):
    """
    Extract document data from grab object in format that is
    suitable to pass to `cls` Item constructor.

    :raises GrabMisuseError: for unsupported selector types
    """
    sel_type = cls._get_selector_type()
    if sel_type == 'xpath':
        return grab.tree
    if sel_type == 'json':
        return grab.response.json
    raise GrabMisuseError('Unknown selector type: %s' % sel_type)
def find_link(self, href_pattern, make_absolute=True):
    """
    Find link in response body which href value matches ``href_pattern``.

    Returns found url or None.

    :raises GrabMisuseError: if a unicode pattern is passed
    """
    if make_absolute:
        self.tree.make_links_absolute(self.response.url)
    # BUGFIX: the check used the bare `unicode` name, which is a
    # NameError on Python 3; six.text_type matches the sibling
    # implementation of find_link in this project.
    if isinstance(href_pattern, six.text_type):
        raise GrabMisuseError('find_link method accepts only '
                              'byte-string argument')
    # only the element and its link target are used
    for elem, _, link, _ in self.tree.iterlinks():
        if elem.tag == 'a' and href_pattern in link:
            return link
    return None
def find_link(self, href_pattern, make_absolute=True):
    """
    Find link in response body which href value matches ``href_pattern``.

    Returns found url or None.

    :raises GrabMisuseError: if a unicode pattern is passed
    """
    if make_absolute:
        self.tree.make_links_absolute(self.doc.url)
    if isinstance(href_pattern, six.text_type):
        raise GrabMisuseError('Method `find_link` accepts only '
                              'byte-string argument')
    pattern = make_unicode(href_pattern)
    for element, _, url, _ in self.tree.iterlinks():
        if element.tag == 'a' and pattern in url:
            return url
    return None
def xpath_list(self, path, filter=None):
    """Deprecated shortcut for ``self.doc.select(path).node_list()``.

    The `filter` argument is kept for signature compatibility only and
    must be None.
    """
    if filter is not None:
        raise GrabMisuseError('Argument `filter` is not supported anymore')
    selection = self.doc.select(path)
    return selection.node_list()
def xpath_one(self, path, default=NULL, filter=None):
    """Deprecated shortcut for ``self.doc.select(path).node()``.

    The `filter` argument is kept for signature compatibility only and
    must be None.
    """
    if filter is not None:
        raise GrabMisuseError('Argument `filter` is not supported anymore')
    selection = self.doc.select(path)
    return selection.node(default=default)
def select(self, xpath=None):
    """Always raises: a TextSelector wraps plain text, so there is no
    node tree to run a sub-query against.

    :raises GrabMisuseError: unconditionally
    """
    raise GrabMisuseError('TextSelector does not allow select method')
def attr(self, key, default=NULL):
    """Always raises: a TextSelector wraps plain text, which has no
    attributes to look up.

    :raises GrabMisuseError: unconditionally
    """
    raise GrabMisuseError('TextSelector does not allow attr method')
def process_config(self, grab):
    """
    Setup curl instance with values from ``self.config``.

    Translates all request-related Grab options (URL, timeouts, body
    storage, HTTP method and payload, headers, cookies, proxy, auth,
    encoding) into pycurl options on ``self.curl``.

    :param grab: configured Grab instance
    :raises error.GrabInvalidUrl: if the URL can not be normalized
    :raises error.GrabMisuseError: on inconsistent option combinations
    """
    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']
    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))
    # py3 hack
    if not six.PY3:
        request_url = make_str(request_url)
    self.curl.setopt(pycurl.URL, request_url)
    # 30* redirects are handled by Grab
    self.curl.setopt(pycurl.FOLLOWLOCATION, 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)
    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)
    # Response body goes to memory or to a file on disk
    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    if grab.config['verbose_logging']:
        self.verbose_logging = True
    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()
    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''
    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])
    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)
    # Ignore SSL errors
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    if grab.request_method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise error.GrabMisuseError(
                'Neither `post` or `multipart_post`'
                ' options was specified for the %s'
                ' request' % grab.request_method)
    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'],
                          six.string_types):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            self.curl.setopt(pycurl.HTTPPOST,
                             process_upload_items(post_items))
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only byte '
                'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    elif grab.request_method == 'OPTIONS':
        data = grab.config['post']
        if data is not None:
            if isinstance(data, six.text_type):
                # BUGFIX: the message used to say "PATCH" here
                # (copy-paste from the PATCH branch)
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if OPTIONS method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
    else:
        raise error.GrabMisuseError('Invalid method: %s'
                                    % grab.request_method)
    # BUGFIX: work on a copy — the in-place update leaked per-request
    # headers into the shared grab.config['common_headers'] dict
    headers = dict(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    # This is required to avoid some problems
    headers.update({'Expect': ''})
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)
    self.process_cookie_options(grab, request_url)
    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))
    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        self.curl.setopt(pycurl.PROXY, '')
    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))
    if grab.config['proxy_type']:
        key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))
    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding'] and
                'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])
    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))
    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])
    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def process_config(self, grab):
    """Build the urllib3 ``Request`` object from ``grab.config``.

    Fills in URL, method, timeouts, body (multipart or plain post),
    proxy settings, headers and cookies, then stores the result in
    ``self._request`` for the subsequent ``request()`` call.

    :param grab: configured Grab instance
    :raises error.GrabInvalidUrl: if the URL can not be normalized
    :raises GrabMisuseError: on inconsistent option combinations
    """
    req = Request(data=None)

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex),
                         make_unicode(grab.config['url'],
                                      errors='ignore')))
    req.url = request_url

    method = grab.detect_request_method()
    req.method = make_str(method)

    req.config_body_maxsize = grab.config['body_maxsize']
    req.config_nobody = grab.config['nobody']

    req.timeout = grab.config['timeout']
    req.connect_timeout = grab.config['connect_timeout']

    # headers derived from the body/user-agent processing below;
    # merged with configured headers at the end
    extra_headers = {}

    # Body processing
    if grab.config['body_inmemory']:
        pass
    else:
        if not grab.config['body_storage_dir']:
            raise GrabMisuseError('Option body_storage_dir is not defined')
        file_, path_ = self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        req.response_file = file_
        req.response_path = path_

    if grab.config['multipart_post'] is not None:
        post_data = grab.config['multipart_post']
        if isinstance(post_data, six.binary_type):
            # already-encoded body is passed through as-is
            pass
        elif isinstance(post_data, six.text_type):
            raise GrabMisuseError('Option multipart_post data'
                                  ' does not accept unicode.')
        else:
            # sequence of key/value pairs -> multipart/form-data body
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            post_items = decode_pairs(post_items,
                                      grab.config['charset'])
            post_items = process_upload_items(post_items)
            post_data, content_type = encode_multipart_formdata(post_items)
            extra_headers['Content-Type'] = content_type
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data
    elif grab.config['post'] is not None:
        post_data = normalize_post_data(grab.config['post'],
                                        grab.config['charset'])
        # py3 hack
        # if six.PY3:
        #     post_data = smart_unicode(post_data,
        #                               grab.config['charset'])
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data

    if method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % method)

    # Proxy
    if grab.config['proxy']:
        req.proxy = grab.config['proxy']
        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']
        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            # default proxy scheme when not configured
            req.proxy_type = 'http'

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()
    extra_headers['User-Agent'] = grab.config['user_agent']

    # Headers
    headers = extra_headers
    headers.update(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    req.headers = headers

    # Cookies
    self.process_cookie_options(grab, req)

    self._request = req