def _urlencode(seq, enc):
    values = [(unicode_to_str(k, enc), unicode_to_str(v, enc))
              for k, vs in seq
              for v in (vs if hasattr(vs, "__iter__") else [vs])]
    return urllib.urlencode(values, doseq=1)

def extract_links(self, response):
    xs = HtmlXPathSelector(response)
    base_url = xs.select('//base/@href').extract()
    base_url = unicode_to_str(base_url[0]) if base_url else unicode_to_str(response.url)

    links = []
    for location in self.locations:
        if isinstance(location, basestring):
            selectors = xs.select(location)
        elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
            selectors = [location] if isinstance(location, HtmlXPathSelector) else location
        else:
            continue
        for selector in selectors:
            links.extend(self.extract_from_selector(selector))

    seen, ret = set(), []
    for link in links:
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            if link.url in seen:
                continue
            else:
                seen.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        ret.append(link)
    return ret

def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8', prepend_level=True):
    """Adapt Twisted log eventDict making it suitable for logging with a Scrapy
    log observer. It may return None to indicate that the event should be
    ignored by a Scrapy log observer.

    `log_level` is the minimum level being logged, and `encoding` is the log
    encoding.
    """
    ev = eventDict.copy()
    if ev['isError']:
        ev.setdefault('logLevel', ERROR)
    # ignore non-error messages from outside scrapy
    if ev.get('system') != 'scrapy' and not ev['isError']:
        return
    level = ev.get('logLevel')
    if level < log_level:
        return
    spider = ev.get('spider')
    if spider:
        ev['system'] = spider.name
    message = ev.get('message')
    lvlname = level_names.get(level, 'NOLEVEL')
    if message:
        message = [unicode_to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
        ev['message'] = message
    why = ev.get('why')
    if why:
        why = unicode_to_str(why, encoding)
        if prepend_level:
            why = "%s: %s" % (lvlname, why)
        ev['why'] = why
    return ev

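# Illustrative call of _adapt_eventdict (a sketch only: the event dict shape
# follows twisted.python.log conventions, the message text is made up, and
# INFO / level_names are assumed to come from the same module):
#
#   ev = {'system': 'scrapy', 'isError': 0, 'logLevel': INFO,
#         'message': (u'Spider opened',)}
#   _adapt_eventdict(ev)['message']   # -> ['INFO: Spider opened']
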
def urljoin_rfc(base, ref, encoding='utf-8'):
    """Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.
    """
    return urlparse.urljoin(unicode_to_str(base, encoding),
                            unicode_to_str(ref, encoding))

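# Minimal usage sketch for urljoin_rfc (hypothetical URLs); unicode refs are
# encoded to a UTF-8 str before joining:
#
#   urljoin_rfc('http://www.example.com/path/', 'other')
#   # -> 'http://www.example.com/path/other'
#   urljoin_rfc(u'http://www.example.com/path/', u'caf\xe9')
#   # -> 'http://www.example.com/path/caf\xc3\xa9'
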
def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8', crawler=None,
                     prepend_level=True):
    """Adapt Twisted log eventDict making it suitable for logging with a Scrapy
    log observer. It may return None to indicate that the event should be
    ignored by a Scrapy log observer.

    `log_level` is the minimum level being logged, and `encoding` is the log
    encoding.
    """
    ev = eventDict.copy()
    if ev['isError']:
        ev.setdefault('logLevel', ERROR)
    # ignore non-error messages from outside scrapy
    if ev.get('system') != 'scrapy' and not ev['isError']:
        return
    level = ev.get('logLevel')
    if level < log_level:
        return
    spider = ev.get('spider')
    if spider:
        ev['system'] = unicode_to_str(spider.name, encoding)
    if crawler and (not spider or spider.crawler is not crawler):
        # ignore events not triggered by own spiders in crawlers' observers
        return
    if not crawler and spider:
        # ignore spiders' events in observers without crawler
        return
    lvlname = level_names.get(level, 'NOLEVEL')
    message = ev.get('message')
    if message:
        message = [unicode_to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
        ev['message'] = message
    why = ev.get('why')
    if why:
        why = unicode_to_str(why, encoding)
        if prepend_level:
            why = "%s: %s" % (lvlname, why)
        ev['why'] = why
    fmt = ev.get('format')
    if fmt:
        fmt = unicode_to_str(fmt, encoding)
        if prepend_level:
            fmt = "%s: %s" % (lvlname, fmt)
        ev['format'] = fmt
    return ev

def test_unicode_to_str(self):
    # converting a unicode object to a utf-8 encoded string
    self.assertEqual(unicode_to_str(u'\xa3 49'), '\xc2\xa3 49')
    # converting a unicode object to a latin-1 encoded string
    self.assertEqual(unicode_to_str(u'\xa3 49', 'latin-1'), '\xa3 49')
    # converting a regular string to string should return the same object
    self.assertEqual(unicode_to_str('lel\xf1e'), 'lel\xf1e')
    # converting a strange object should raise TypeError
    self.assertRaises(TypeError, unicode_to_str, unittest)
    # check errors argument works
    assert '?' in unicode_to_str(u'a\ufffdb', 'latin-1', errors='replace')

def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent-encode paths and query arguments. Non-ASCII characters are
      percent-encoded using UTF-8 (RFC 3986)
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is always a
    str.

    For examples see the tests in scrapy.tests.test_utils_url
    """
    url = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(urllib.unquote(path))
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))

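# Hedged example of canonicalize_url (illustrative URL): query arguments are
# sorted and the fragment is dropped by default; blank values survive unless
# keep_blank_values=False:
#
#   canonicalize_url('http://www.example.com/do?b=2&a=1&c=#frag')
#   # -> 'http://www.example.com/do?a=1&b=2&c='
#   canonicalize_url('http://www.example.com/do?b=2&a=1&c=', keep_blank_values=False)
#   # -> 'http://www.example.com/do?a=1&b=2'
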
def get_log_item(ev, min_level=log.INFO):
    """Get HubStorage log item for the given Twisted event, or None if no
    document should be inserted
    """
    if ev['system'] == 'scrapy':
        level = ev['logLevel']
    else:
        if ev['isError']:
            level = log.ERROR
        else:
            return  # ignore non-scrapy & non-error messages
    if level < min_level:
        return
    msg = ev.get('message')
    if msg:
        msg = unicode_to_str(msg[0])
    failure = ev.get('failure', None)
    if failure:
        msg = failure.getTraceback()
    why = ev.get('why', None)
    if why:
        msg = "%s\n%s" % (why, msg)
    fmt = ev.get('format')
    if fmt:
        try:
            msg = fmt % ev
        except Exception:
            msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
            level = log.ERROR
    msg = msg.replace('\n', '\n\t')  # to replicate typical scrapy log appearance
    return {'message': msg, 'level': level, 'time': int(time.time() * 1000)}

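# Sketch of get_log_item on a well-formed scrapy event (the values are made
# up; `log` is assumed to be the scrapy log module with numeric INFO/ERROR
# level constants):
#
#   ev = {'system': 'scrapy', 'isError': 0, 'logLevel': log.INFO,
#         'message': (u'Spider opened',)}
#   get_log_item(ev)
#   # -> {'message': 'Spider opened', 'level': log.INFO, 'time': <epoch millis>}
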
def _umock(result=None, error=None):
    response = {}
    if result is not None:
        response.update(result=result)
    if error is not None:
        response.update(error=error)
    return BytesIO(unicode_to_str(json.dumps(response)))

def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    return url if isinstance(url, ParseResult) else \
        urlparse(unicode_to_str(url, encoding))

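# Tiny sketch: parse_url passes an already-parsed result straight through
# (hypothetical URL):
#
#   parsed = parse_url(u'http://example.com/a?b=1')
#   parsed.netloc                  # -> 'example.com'
#   parse_url(parsed) is parsed    # -> True
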
def get_uid(url):
    """Get the uid of the url.

    Algorithm:
    1) get the 16-byte (128-bit) md5 digest, encoded as hex
    2) split it into the first 8 bytes and the last 8 bytes
    3) convert each half into an int
    4) XOR the two ints
    5) encode the result as hex
    """
    # convert unicode to str (encoding as utf-8); this function is str safe,
    # without double-encode errors
    url = unicode_to_str(url)
    if isinstance(url, types.StringType):
        # md5 here is a 32-character hex string representing a 16-byte digest
        md5 = hashlib.new("md5", url).hexdigest()
        first_half_bytes = md5[:16]
        last_half_bytes = md5[16:]
        # get the two long ints
        first_half_int = int(first_half_bytes, 16)
        last_half_int = int(last_half_bytes, 16)
        # XOR the two long ints, yielding a long int
        xor_int = first_half_int ^ last_half_int
        # convert to a hex string
        uid = "%x" % xor_int
        return uid
    else:
        raise Exception('cannot sign a non-string object: %s' % type(url))

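# Worked sketch of the md5-fold above (mirrors the docstring steps; the URL is
# just an example and hashlib is assumed imported as in the function):
#
#   url = 'http://example.com/'
#   md5 = hashlib.new("md5", url).hexdigest()        # 32 hex chars = 16 bytes
#   hi, lo = int(md5[:16], 16), int(md5[16:], 16)    # two 64-bit halves
#   assert get_uid(url) == "%x" % (hi ^ lo)          # folded to <= 16 hex chars
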
def get_uid(url):
    """Get the uid of the url.

    Algorithm:
    1) get the 16-byte (128-bit) md5 digest, encoded as hex
    2) split it into the first 8 bytes and the last 8 bytes
    3) convert each half into an int
    4) XOR the two ints
    5) encode the result as hex
    """
    # convert unicode to str (encoding as utf-8); this function is str safe,
    # without double-encode errors
    url = unicode_to_str(url)
    if isinstance(url, types.StringType):
        # md5 here is a 32-character hex string representing a 16-byte digest
        md5 = hashlib.new("md5", url).hexdigest()
        first_half_bytes = md5[:16]
        last_half_bytes = md5[16:]
        # get the two long ints
        first_half_int = int(first_half_bytes, 16)
        last_half_int = int(last_half_bytes, 16)
        # XOR the two long ints, yielding a long int
        xor_int = first_half_int ^ last_half_int
        # convert to a hex string
        uid = "%x" % xor_int
        return uid
    # note: unlike the variant above, non-string input falls through here and
    # returns None instead of raising

def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent-encode paths and query arguments. Non-ASCII characters are
      percent-encoded using UTF-8 (RFC 3986)
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is always a
    str.

    For examples see the tests in scrapy.tests.test_utils_url
    """
    url = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = urllib.quote(urllib.unquote(path))
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))

def _adapt_eventdict(eventDict, log_level=INFO, encoding="utf-8", prepend_level=True): """Adapt Twisted log eventDict making it suitable for logging with a Scrapy log observer. It may return None to indicate that the event should be ignored by a Scrapy log observer. `log_level` is the minimum level being logged, and `encoding` is the log encoding. """ ev = eventDict.copy() if ev["isError"]: ev.setdefault("logLevel", ERROR) # ignore non-error messages from outside scrapy if ev.get("system") != "scrapy" and not ev["isError"]: return level = ev.get("logLevel") if level < log_level: return spider = ev.get("spider") if spider: ev["system"] = spider.name lvlname = level_names.get(level, "NOLEVEL") message = ev.get("message") if message: message = [unicode_to_str(x, encoding) for x in message] if prepend_level: message[0] = "%s: %s" % (lvlname, message[0]) ev["message"] = message why = ev.get("why") if why: why = unicode_to_str(why, encoding) if prepend_level: why = "%s: %s" % (lvlname, why) ev["why"] = why fmt = ev.get("format") if fmt: fmt = unicode_to_str(fmt, encoding) if prepend_level: fmt = "%s: %s" % (lvlname, fmt) ev["format"] = fmt return ev
def __init__(self, response=None, text=None, root=None, expr=None, namespaces=None):
    if text:
        self.response = TextResponse(url='about:blank',
                                     body=unicode_to_str(text, 'utf-8'),
                                     encoding='utf-8')
    else:
        self.response = response
    self._root = root
    self._xpathev = None
    self.namespaces = namespaces
    self.expr = expr

def __init__(self, *args, **kwargs): formdata = kwargs.pop("formdata", None) super(FormRequest, self).__init__(*args, **kwargs) if formdata: items = formdata.iteritems() if isinstance(formdata, dict) else formdata query = [(unicode_to_str(k, self.encoding), _unicode_to_str(v, self.encoding)) for k, v in items] self.method = "POST" self._set_body(urllib.urlencode(query, doseq=1)) self.headers["Content-Type"] = "application/x-www-form-urlencoded"
def parse_article(self, response):
    content = unicode_to_str(response.body_as_unicode(), 'latin-1', 'ignore')
    for block in [content[i:i + 1000] for i in range(0, len(content), 1000)]:
        # Build the data structure that RNDADDENTROPY requires
        format = 'ii%is' % len(block)
        entropy_data = struct.pack(format, 8 * len(block), len(block), block)
        # Call the RNDADDENTROPY ioctl
        random_dev_fd = os.open('/dev/random', os.O_WRONLY)
        ioctl(random_dev_fd, RNDADDENTROPY, entropy_data)
        os.close(random_dev_fd)

def get_crawl_args(message):
    """Return the command-line arguments to use for the scrapy crawl process
    that will be started for this message
    """
    msg = message.copy()
    args = [unicode_to_str(msg['_spider'])]
    del msg['_project'], msg['_spider']
    for k, v in stringify_dict(msg, keys_only=False).items():
        args += ['-a', '%s=%s' % (k, v)]
    return args

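# Hedged usage sketch (the message keys mirror scrapyd queue messages; the
# values are made up and stringify_dict is assumed imported alongside
# unicode_to_str):
#
#   msg = {'_project': 'myproject', '_spider': 'toscrape', 'arg1': u'val1'}
#   get_crawl_args(msg)   # -> ['toscrape', '-a', 'arg1=val1']
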
def __init__(self, response=None, text=None, namespaces=None, _root=None, _expr=None):
    if text is not None:
        response = TextResponse(url='about:blank',
                                body=unicode_to_str(text, 'utf-8'),
                                encoding='utf-8')
    if response is not None:
        _root = LxmlDocument(response, self._parser)
    self.namespaces = namespaces
    self.response = response
    self._root = _root
    self._expr = _expr

def __init__(self, *args, **kwargs):
    formdata = kwargs.pop('formdata', None)
    super(FormRequest, self).__init__(*args, **kwargs)
    if formdata:
        items = formdata.iteritems() if isinstance(formdata, dict) else formdata
        query = [(unicode_to_str(k, self.encoding), _unicode_to_str(v, self.encoding))
                 for k, v in items]
        self.method = 'POST'
        self.body = urllib.urlencode(query, doseq=1)
        self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

def get_crawl_args_dict(message):
    """Return arguments dictionary to use for output"""
    argsDict = {}
    msg = message.copy()
    # spider name is normalized as in get_crawl_args, though the dict output
    # below only carries the remaining arguments and settings
    args = [unicode_to_str(msg['_spider'])]
    del msg['_project'], msg['_spider']
    settings = msg.pop('settings', {})
    for k, v in stringify_dict(msg, keys_only=False).items():
        argsDict[k] = v
    for k, v in stringify_dict(settings, keys_only=False).items():
        argsDict[k] = v
    return argsDict

def add_sample(source):
    """Method for adding samples to the test samples file (use from console)"""
    count = 0
    while os.path.exists("%s_%d.json" % (SAMPLES_FILE_PREFIX, count)):
        count += 1
    with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "wb") as f:
        f.write(unicode_to_str(source))
    parsed = list(parse_html(source))
    with open("%s_%d.json" % (SAMPLES_FILE_PREFIX, count), "wb") as f:
        f.write(json.dumps(parsed, default=_encode_element, indent=8))

def __init__(self, response=None, text=None, node=None, parent=None, expr=None):
    if parent is not None:
        self.doc = parent.doc
        self.xmlNode = node
    elif response:
        self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
        self.xmlNode = self.doc.xmlDoc
    elif text:
        response = TextResponse(url='about:blank',
                                body=unicode_to_str(text, 'utf-8'),
                                encoding='utf-8')
        self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
        self.xmlNode = self.doc.xmlDoc
    self.expr = expr

def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
    """Log message according to the level"""
    if level > log_level:
        return
    if domain is not None:
        import warnings
        warnings.warn("'domain' argument of scrapy.log.msg() is deprecated, "
                      "use 'spider' argument instead", DeprecationWarning,
                      stacklevel=2)
    dispatcher.send(signal=logmessage_received, message=message, level=level,
                    spider=spider)
    system = domain or (spider.domain_name if spider else component)
    msg_txt = unicode_to_str("%s: %s" % (level_names[level], message))
    log.msg(msg_txt, system=system)

def err(_stuff=None, _why=None, **kwargs):
    if ERROR > log_level:
        return
    domain = kwargs.pop('domain', None)
    spider = kwargs.pop('spider', None)
    component = kwargs.pop('component', BOT_NAME)
    if domain is not None:
        import warnings
        warnings.warn("'domain' argument of scrapy.log.err() is deprecated, "
                      "use 'spider' argument instead", DeprecationWarning,
                      stacklevel=2)
    kwargs['system'] = domain or (spider.domain_name if spider else component)
    if _why:
        _why = unicode_to_str("ERROR: %s" % _why)
    log.err(_stuff, _why, **kwargs)

def safe_url_string(url, encoding='utf8'):
    """Convert the given url into a legal URL by escaping unsafe characters
    according to RFC 3986. If a unicode url is given, it is first converted to
    str using the given encoding (which defaults to 'utf8'). When passing an
    encoding, you should use the encoding of the original page (the page from
    which the url was extracted).

    Calling this function on an already "safe" url will return the url
    unmodified.

    Always returns a str.
    """
    s = unicode_to_str(url, encoding)
    return urllib.quote(s, _safe_chars)

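# Illustrative call (hypothetical URL; assumes _safe_chars matches the usual
# RFC 3986 safe set): non-ASCII and unsafe characters are percent-escaped,
# and already-safe URLs pass through unchanged:
#
#   safe_url_string(u'http://www.example.com/caf\xe9 menu')
#   # -> 'http://www.example.com/caf%C3%A9%20menu'
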
def select(self, xpath):
    if hasattr(self.xmlNode, 'xpathEval'):
        self.doc.xpathContext.setContextNode(self.xmlNode)
        xpath = unicode_to_str(xpath, 'utf-8')
        try:
            xpath_result = self.doc.xpathContext.xpathEval(xpath)
        except libxml2.xpathError:
            raise ValueError("Invalid XPath: %s" % xpath)
        if hasattr(xpath_result, '__iter__'):
            return XPathSelectorList([self.__class__(node=node, parent=self, expr=xpath)
                                      for node in xpath_result])
        else:
            return XPathSelectorList([self.__class__(node=xpath_result, parent=self,
                                                     expr=xpath)])
    else:
        return XPathSelectorList([])

def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    if args and kwargs:
        raise ValueError("Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
    data = unicode_to_str(json.dumps(req))
    # assumes `urllib` here is six.moves.urllib (or Python 3's urllib), so
    # that urllib.request.urlopen is available
    body = urllib.request.urlopen(url, data).read()
    res = json.loads(body.decode('utf-8'))
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)

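# Hedged usage sketch against a hypothetical JSON-RPC 2.0 endpoint (the URL
# and method name are made up):
#
#   result = jsonrpc_client_call('http://localhost:6080/jsonrpc', 'add', 2, 3)
#   # -> the remote 'add' result, e.g. 5, or raises JsonRpcError on 'error'
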
def __init__(self, response=None, text=None, node=None, parent=None, expr=None,
             use_html5lib=False, use_BeautifulSoup=False, namespaces=None):
    if parent:
        self.doc = parent.doc
        self.xmlNode = node
    elif response:
        self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
                                                 use_BeautifulSoup)
        self.doc = self.xmlNode.getroottree()
    elif text:
        response = TextResponse(url='about:blank', body=unicode_to_str(text),
                                encoding='utf-8')
        self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
                                                 use_BeautifulSoup)
        self.doc = self.xmlNode.getroottree()
    self.expr = expr
    self.namespaces = namespaces or {}

def assertObjectMatch(self, expected, actual, msg=None, keys=None):  # {{{
    """If key starts with r:, do regex match, else do equal test"""
    actual_is_dict = hasattr(actual, "__getitem__")

    def my_type(o):
        return str(type(o))[7:-2]

    def parse_key(key):
        how = None
        parts = key.split(":")
        if len(parts) == 2:
            how, key = parts[0], parts[1]
        return (how, key)

    def get_value(obj, key):
        if actual_is_dict:
            return obj.get(key, None)
        else:
            return getattr(obj, key, None)

    def check_match(expected, actual, how):
        if how == "r":  # regex match
            if not expected or not actual:
                return expected == actual
            else:
                return re.search(expected, actual) is not None
        else:
            return expected == actual

    keys = keys or expected.keys()
    err_lines = []
    for key in keys:
        how, actual_key = parse_key(key)
        ev = expected[key]
        av = get_value(actual, actual_key)
        if not check_match(ev, av, how):
            errmsg = "%s: %s %s != %s %s" % (key, my_type(ev), ev, my_type(av), av)
            err_lines.append(errmsg)
    errmsg = "\n".join(err_lines)
    if msg:
        errmsg = msg + "\n" + errmsg
    self.failIf(err_lines, unicode_to_str(errmsg))

def _get_log_item(self, ev):
    """Get HubStorage log item for the given Twisted event, or None if no
    document should be inserted
    """
    if ev['system'] == 'scrapy':
        level = ev['logLevel']
    else:
        if ev['isError']:
            level = logging.ERROR
        else:
            level = logging.INFO
    # It's important to access the level through the handler instance,
    # as the min log level can change at any moment.
    if level < self._hs_loghdlr.level:
        return
    msg = ev.get('message')
    if msg:
        msg = unicode_to_str(msg[0])
    failure = ev.get('failure', None)
    if failure:
        msg = failure.getTraceback()
    why = ev.get('why', None)
    if why:
        msg = "%s\n%s" % (why, msg)
    fmt = ev.get('format')
    if fmt:
        try:
            msg = fmt % ev
        except Exception:
            msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
            level = logging.ERROR
    msg = msg.replace('\n', '\n\t')  # to replicate typical scrapy log appearance
    return {'message': msg, 'level': level}

def _add_link(url_sel, alt_sel=None):
    # closure: `ret` and `encoding` come from the enclosing scope
    url = flatten([url_sel.extract()])
    alt = flatten([alt_sel.extract()]) if alt_sel else (u'',)
    if url:
        ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))

def to_scrapy_response(url, body):
    return TextResponse(url=url, body=unicode_to_str(body, 'utf-8'),
                        encoding='utf-8')

def _unicode_to_str(string, encoding):
    if hasattr(string, '__iter__'):
        return [unicode_to_str(k, encoding) for k in string]
    else:
        return unicode_to_str(string, encoding)

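# Tiny sketch of the scalar/iterable dispatch (Python 2: str/unicode have no
# __iter__, so only real containers take the list branch):
#
#   _unicode_to_str(u'caf\xe9', 'utf-8')        # -> 'caf\xc3\xa9'
#   _unicode_to_str([u'a', u'b'], 'utf-8')      # -> ['a', 'b']
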
def u_to_str(text):
    return unicode_to_str(text, 'latin-1', 'ignore')

def unicode_to_gbk(src):
    return unicode_to_str(src, 'gbk', errors='ignore')

def _response_from_text(text, st):
    rt = XmlResponse if st == 'xml' else HtmlResponse
    return rt(url='about:blank', encoding='utf-8',
              body=unicode_to_str(text, 'utf-8'))

def _unicode_to_str(self, eventDict):
    message = eventDict.get('message')
    if message:
        eventDict['message'] = tuple(
            unicode_to_str(x, self.encoding) for x in message)
    return eventDict