def _encode_params(self, data):
    '''Encode parameters in a piece of data.

    Will successfully encode parameters when passed as a dict or a list of
    2-tuples. Order is retained if data is a list of 2-tuples but arbitrary
    if parameters are supplied as a dict.
    '''
    if isinstance(data, basestring):
        return to_str(data, self._encoding)
    elif hasattr(data, '__iter__'):
        result = []
        if isinstance(data, dict):
            items = data.iteritems()
        else:
            items = data
        for k, vs in items:
            if not hasattr(vs, '__iter__'):
                vs = [vs]
            for v in vs:
                if v is not None:
                    result.append((to_str(k, self._encoding),
                                   to_str(v, self._encoding)))
        return urlencode(result, doseq=True)
    else:
        raise TypeError('Bad type for `params` object: %s' % type(data))
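# Usage sketch for _encode_params (hypothetical, outside any class): the
# heavy lifting is stdlib urlencode with doseq=True, which expands
# sequence values into repeated keys.
from urllib import urlencode

pairs = [('q', 'crawlmi'), ('tag', 'a'), ('tag', 'b')]
print urlencode(pairs, doseq=True)                   # q=crawlmi&tag=a&tag=b
# dict input reaches urlencode in arbitrary order, hence the docstring's
# warning; sequence values are expanded the same way:
print urlencode([('tag', ['a', 'b'])], doseq=True)   # tag=a&tag=b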
def _extract_links_from_html(self, html, response_encoding):
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = attr_val
                    if isinstance(url, unicode):
                        try:
                            url = to_str(url, response_encoding)
                        except UnicodeEncodeError:
                            # fallback
                            url = to_str(url, 'utf-8')
                    url = requote_url(url)
                    url = correct_relative_path(url)
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url,
                        etype=type(e), error=e)
                else:
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
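# Minimal demonstration of the lxml API the extractor above relies on;
# only lxml.html is assumed here, the Link class and the to_str/requote_url
# helpers are crawlmi's own.
import lxml.html

doc = lxml.html.fromstring(
    '<a href="/page?q=1" rel="nofollow">next</a>',
    base_url='http://example.com')
for el, attr, attr_val, pos in doc.iterlinks():
    # iterlinks() yields (element, attribute, link, pos) 4-tuples
    print el.tag, attr, attr_val, el.attrib.get('rel') == 'nofollow'
    # -> a href /page?q=1 True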
def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8',
                     prepend_level=True):
    '''Adapt Twisted log eventDict, making it suitable for logging with a
    Crawlmi log observer. It may return None to indicate that the event
    should be ignored by a Crawlmi log observer.

    `log_level` is the minimum level being logged, and `encoding` is the
    log encoding.
    '''
    ev = eventDict.copy()
    if ev['isError']:
        ev.setdefault('logLevel', ERROR)
    # ignore non-error messages from outside crawlmi
    if ev.get('system') != 'crawlmi' and not ev['isError']:
        return
    level = ev.get('logLevel')
    if level < log_level:
        return
    lvlname = level_names.get(level, 'NOLEVEL')

    message = ev.get('message')
    if message:
        message = [to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = '%s: %s' % (lvlname, message[0])
        ev['message'] = message

    why = ev.get('why')
    if why:
        why = to_str(why, encoding)
        if prepend_level:
            why = '%s: %s' % (lvlname, why)
        ev['why'] = why

    fmt = ev.get('format')
    if fmt:
        fmt = to_str(fmt, encoding)
        if prepend_level:
            fmt = '%s: %s' % (lvlname, fmt)
        ev['format'] = fmt

    return ev
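# Illustrative call (a sketch; level_names, ERROR and to_str are this
# module's objects, so the exact output depends on their definitions):
ev = {'isError': 1, 'system': 'twisted', 'message': ('boom',)}
adapted = _adapt_eventdict(ev)
# error events bypass the crawlmi-only filter and get 'logLevel' defaulted
# to ERROR, so adapted['message'][0] becomes e.g. 'ERROR: boom'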
def __init__(self, response=None, text=None, namespaces=None, _root=None):
    if text is not None:
        response = TextResponse(url='about:blank', body=to_str(text))
    if response is not None:
        _root = self._get_root(response)
    self.namespaces = namespaces
    self.response = response
    self._root = _root
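# Hypothetical construction sketch (the owning class name is not shown
# above; 'Selector' is assumed). Raw text is wrapped in a TextResponse
# with a dummy URL before the root node is built:
sel = Selector(text=u'<html><body><p>hi</p></body></html>')
# which is equivalent to:
# sel = Selector(response=TextResponse(url='about:blank',
#                                      body='<html><body><p>hi</p></body></html>'))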
def get_meta_refresh(response):
    '''Parse the http-equiv refresh parameter from the given HTML response.
    Return tuple (interval, url).
    '''
    text = remove_entities(response.text[0:4096])
    text = html_comment_re.sub(u'', text)
    text = html_noscript_re.sub(u'', text)
    text = html_script_re.sub(u'', text)
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = requote_url(to_str(m.group('url').strip(' "\''),
                                 response.encoding))
        url = urlparse.urljoin(response.url, url)
        return (interval, url)
    else:
        return (None, None)
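# The tag this parser targets, and the resolution step it performs:
#   <meta http-equiv="refresh" content="5; url=/next">
# yields interval 5.0 and the url joined against the response url; the
# join itself is plain stdlib:
import urlparse
print urlparse.urljoin('http://example.com/a/b', '/next')
# -> http://example.com/next
# With no refresh tag present, get_meta_refresh returns (None, None).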
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     strip_utm_tags=True, strip_www=False, encoding=None):
    '''Canonicalize the given url by applying the following procedures:
    - sort query arguments, first by key, then by value
    - percent-encode paths and query arguments. Non-ASCII characters are
      percent-encoded using UTF-8 (RFC 3986)
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent-encoding case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values
      is True)
    - remove fragments (unless keep_fragments is True)
    - remove utm tags from the query and fragment (unless strip_utm_tags
      is False)
    - strip the `www.` subdomain (unless strip_www is False)
    '''
    def _strip_tags(keyvals):
        return filter(lambda (k, v): not _utm_tags_re.match(k), keyvals)

    if isinstance(url, basestring):
        url = to_str(url, encoding)
    else:
        raise TypeError('Bad type for `url` object: %s' % type(url))

    scheme, netloc, path, params, query, fragment = urlparse(url)
    netloc = netloc.lower()
    if strip_www:
        auth, _, domain = netloc.rpartition('@')
        if domain.startswith('www.'):
            domain = domain[4:]
        netloc = '%s@%s' % (auth, domain) if auth else domain
    keyvals = parse_qsl(query, keep_blank_values)
    keyvals.sort()
    if strip_utm_tags:
        keyvals = _strip_tags(keyvals)
    query = urllib.urlencode(keyvals)
    path = _correct_relative_path(path)
    if not path:
        path = '/'
    fragment = '' if not keep_fragments else fragment
    # sometimes utm tags are inside the fragment
    if fragment and strip_utm_tags:
        try:
            parsed_fragment = parse_qsl(fragment, keep_blank_values=True,
                                        strict_parsing=True)
            fragment = urllib.urlencode(_strip_tags(parsed_fragment))
        except ValueError:
            pass
    return requote_url(urlunparse(
        [scheme, netloc, path, params, query, fragment]))
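# Expected normalization, under the assumption that _correct_relative_path
# resolves dot segments:
#   canonicalize_url('http://Example.com/a/../b?b=2&a=1&utm_source=x#frag')
#   -> 'http://example.com/b?a=1&b=2'
# The sort-then-reencode step for query arguments is plain stdlib:
from urlparse import parse_qsl
from urllib import urlencode
print urlencode(sorted(parse_qsl('b=2&a=1&a=0', keep_blank_values=True)))
# -> a=0&a=1&b=2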
def _extract_links_from_html(self, html, response_encoding):
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = requote_url(to_str(
                        to_unicode(attr_val, 'utf-8'), response_encoding))
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url,
                        etype=type(e), error=e)
                else:
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def url_allowed(self, url):
    url = to_str(url)
    parsed_url = urlparse(url)
    allowed = parsed_url.scheme in ['http', 'https', 'file']
    # filter mobile and pda sites
    if allowed and self.filter_mobile:
        allowed &= not parsed_url.netloc.startswith('m.')
        allowed &= not parsed_url.netloc.startswith('pda.')
    if allowed and self.allow_res:
        allowed &= _matches(url, self.allow_res)
    if allowed and self.deny_res:
        allowed &= not _matches(url, self.deny_res)
    if allowed and self.allow_domains:
        allowed &= is_url_from_any_domain(parsed_url, self.allow_domains)
    if allowed and self.deny_domains:
        allowed &= not is_url_from_any_domain(parsed_url, self.deny_domains)
    if allowed and self.deny_extensions:
        allowed &= not has_url_any_extension(parsed_url, self.deny_extensions)
    return allowed
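# Illustrative decisions, assuming an instance configured with
# filter_mobile=True and '.jpg' among deny_extensions (and assuming
# has_url_any_extension matches by file suffix):
#   url_allowed('http://example.com/page')    -> True
#   url_allowed('ftp://example.com/file')     -> False  (scheme not allowed)
#   url_allowed('http://m.example.com/page')  -> False  (mobile filter)
#   url_allowed('http://example.com/img.jpg') -> False  (denied extension)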
def _prepare_url(self, url, params):
    if isinstance(url, basestring):
        url = to_str(url, self._encoding)
    else:
        raise TypeError('Bad type for `url` object: %s' % type(url))

    scheme, netloc, path, _params, query, fragment = urlparse(url)
    if not scheme:
        raise ValueError('Invalid URL %s: No scheme supplied.' % url)
    if not netloc and not path:
        raise ValueError('Invalid URL %s: No netloc nor path supplied.' % url)

    # Bare domains aren't valid URLs.
    if not path:
        path = '/'

    enc_params = self._encode_params(params)
    if enc_params:
        if query:
            query = '%s&%s' % (query, enc_params)
        else:
            query = enc_params

    # ajax escaping
    if fragment.startswith('!'):
        fragment = requote_ajax(fragment[1:])
        if query:
            query = '%s&_escaped_fragment_=%s' % (query, fragment)
        else:
            query = '_escaped_fragment_=%s' % fragment
        fragment = ''

    quoted = requote_url(urlunparse(
        [scheme, netloc, path, _params, query, fragment]))
    self.parsed_url = urlparse(quoted)
    return quoted
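# The AJAX-crawling rewrite in concrete terms (the old Google
# _escaped_fragment_ convention): a '#!' fragment is folded into the
# query string. Assuming requote_ajax percent-encodes its argument:
#   _prepare_url('http://example.com/page#!key=value', None)
#   -> 'http://example.com/page?_escaped_fragment_=key%3Dvalue'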
def _extract_links_from_html(self, html, response_encoding):
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = requote_url(to_str(
                        to_unicode(attr_val, 'utf-8'), response_encoding))
                    url = correct_relative_path(url)
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url,
                        etype=type(e), error=e)
                else:
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links
def _prepare_body(self, body):
    return to_str(body, self._encoding)
def normkey(self, key):
    return to_str(key.title(), self.encoding)
def normvalue(self, value):
    if not hasattr(value, '__iter__'):
        value = [value]
    return [to_str(x, self.encoding) for x in value]
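# Taken together, the two hooks above normalize header names and values
# before storage; e.g. with encoding='utf-8' (results are regular Python 2
# byte strings):
#   normkey('content-TYPE')  -> 'Content-Type'  # str.title()
#   normvalue(u'text/html')  -> ['text/html']   # scalar wrapped in a list
#   normvalue(['a', u'b'])   -> ['a', 'b']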
def test_to_str(self):
    self.assertEqual(to_str(u'\xa3 49'), '\xc2\xa3 49')
    self.assertEqual(to_str(u'\xa3 49', 'latin-1'), '\xa3 49')
    self.assertEqual(to_str('lel\xf1e'), 'lel\xf1e')
    self.assertEqual(to_str([10, 11]), '[10, 11]')
    self.assertIn('?', to_str(u'a\ufffdb', 'latin-1', errors='replace'))