Example No. 1
    def _encode_params(self, data):
        '''Encode parameters in a piece of data.

        Will successfully encode parameters when passed as a dict or a list of
        2-tuples. Order is retained if data is a list of 2-tuples but arbitrary
        if parameters are supplied as a dict.
        '''
        if isinstance(data, basestring):
            return to_str(data, self._encoding)
        elif hasattr(data, '__iter__'):
            result = []
            if isinstance(data, dict):
                items = data.iteritems()
            else:
                items = data
            for k, vs in items:
                if not hasattr(vs, '__iter__'):
                    vs = [vs]
                for v in vs:
                    if v is not None:
                        result.append((to_str(k, self._encoding),
                                       to_str(v, self._encoding)))
            return urlencode(result, doseq=True)
        else:
            raise TypeError('Bad type for `params` object: %s' % type(data))
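For reference, a minimal standalone sketch (Python 2 standard library only) of the flattening this method ultimately hands to urlencode(..., doseq=True); the parameter names and values are made up:

    # Multi-valued parameters become repeated key=value pairs; None values
    # would already have been dropped by the loop above.
    from urllib import urlencode

    pairs = [('tag', 'python'), ('tag', 'http'), ('page', '2')]
    print urlencode(pairs, doseq=True)   # tag=python&tag=http&page=2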
Example No. 2
 def _extract_links_from_html(self, html, response_encoding):
     links = []
     for el, attr, attr_val, pos in html.iterlinks():
         if self.tag_func(el.tag):
             if self.attr_func(attr):
                 try:
                     url = attr_val
                     if isinstance(url, unicode):
                         try:
                             url = to_str(url, response_encoding)
                         except UnicodeEncodeError:
                             # fall back to UTF-8 when the response encoding cannot represent the URL
                             url = to_str(url, 'utf-8')
                     url = requote_url(url)
                     url = correct_relative_path(url)
                     text = el.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (el.attrib.get('rel') == 'nofollow')
                 except Exception as e:
                     log.msg(
                         format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING, url=html.base_url, etype=type(e),
                         error=e)
                 else:
                     links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
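The iterlinks() call consumed above is lxml.html's link iterator, yielding (element, attribute, link, pos) 4-tuples; a minimal standalone sketch of that input (assumes lxml is installed, the HTML snippet is made up):

    # What iterlinks() hands to the loop above: one tuple per link-carrying
    # attribute (href, src, ...), together with the element it was found on.
    import lxml.html

    doc = lxml.html.fromstring(
        '<a href="/about" rel="nofollow">About</a><img src="logo.png">')
    for el, attr, link, pos in doc.iterlinks():
        print el.tag, attr, link, el.attrib.get('rel') == 'nofollow'
    # a href /about True
    # img src logo.png False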
Example No. 3
def _adapt_eventdict(eventDict,
                     log_level=INFO,
                     encoding='utf-8',
                     prepend_level=True):
    '''Adapt a Twisted log eventDict, making it suitable for logging with a Crawlmi
    log observer. It may return None to indicate that the event should be
    ignored by a Crawlmi log observer.

    `log_level` is the minimum level being logged, and `encoding` is the log
    encoding.
    '''
    ev = eventDict.copy()
    if ev['isError']:
        ev.setdefault('logLevel', ERROR)

    # ignore non-error messages from outside crawlmi
    if ev.get('system') != 'crawlmi' and not ev['isError']:
        return

    level = ev.get('logLevel')
    if level < log_level:
        return

    lvlname = level_names.get(level, 'NOLEVEL')
    message = ev.get('message')
    if message:
        message = [to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = '%s: %s' % (lvlname, message[0])
        ev['message'] = message

    why = ev.get('why')
    if why:
        why = to_str(why, encoding)
        if prepend_level:
            why = '%s: %s' % (lvlname, why)
        ev['why'] = why

    fmt = ev.get('format')
    if fmt:
        fmt = to_str(fmt, encoding)
        if prepend_level:
            fmt = '%s: %s' % (lvlname, fmt)
        ev['format'] = fmt

    return ev
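A hedged usage sketch of the adapter above. The dict layout ('message' as a tuple of parts, 'isError', 'system') follows Twisted's log eventDict convention; the expected result assumes the module-level level_names maps INFO to 'INFO':

    # Hypothetical Twisted-style log event as a Crawlmi observer would receive it.
    event = {
        'message': (u'Spider opened',),   # Twisted passes message parts as a tuple
        'isError': 0,
        'system': 'crawlmi',
        'logLevel': INFO,                 # module-level constant used above
    }
    adapted = _adapt_eventdict(event)
    # adapted['message'] == ['INFO: Spider opened']  -- encoded to str and level-prefixed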
Example No. 4
    def __init__(self, response=None, text=None, namespaces=None, _root=None):
        if text is not None:
            response = TextResponse(url='about:blank', body=to_str(text))
        if response is not None:
            _root = self._get_root(response)

        self.namespaces = namespaces
        self.response = response
        self._root = _root
Example No. 5
def get_meta_refresh(response):
    '''Parse the http-equiv refresh parameter from the given HTML response.
    Return tuple (interval, url).'''
    text = remove_entities(response.text[0:4096])
    text = html_comment_re.sub(u'', text)
    text = html_noscript_re.sub(u'', text)
    text = html_script_re.sub(u'', text)

    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = requote_url(to_str(m.group('url').strip(' "\''), response.encoding))
        url = urlparse.urljoin(response.url, url)
        return (interval, url)
    else:
        return (None, None)
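A hedged sketch of what the parser above returns for a typical refresh tag; the TextResponse constructor arguments are assumptions based on Example No. 4, and only the (interval, url) shape is stated by the docstring:

    # Hypothetical response whose body carries a standard meta refresh tag.
    body = '<html><head><meta http-equiv="refresh" content="5; url=/next"></head></html>'
    response = TextResponse(url='http://example.com/', body=body)
    print get_meta_refresh(response)
    # (5.0, 'http://example.com/next')  -- interval as a float, url joined against response.url
    print get_meta_refresh(TextResponse(url='http://example.com/', body='<html></html>'))
    # (None, None)  -- no refresh tag found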
Example No. 6
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     strip_utm_tags=True, strip_www=False, encoding=None):
    '''Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent-encode paths and query arguments. Non-ASCII characters are
      percent-encoded using UTF-8 (RFC 3986)
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)
    - strip `www.` subdomain (unless strip_www is False)
    '''
    def _strip_tags(keyvals):
        return filter(lambda (k, v): not _utm_tags_re.match(k), keyvals)

    if isinstance(url, basestring):
        url = to_str(url, encoding)
    else:
        raise TypeError('Bad type for `url` object: %s' % type(url))

    scheme, netloc, path, params, query, fragment = urlparse(url)
    netloc = netloc.lower()
    if strip_www:
        auth, _, domain = netloc.rpartition('@')
        if domain.startswith('www.'):
            domain = domain[4:]
            netloc = '%s@%s' % (auth, domain) if auth else domain

    keyvals = parse_qsl(query, keep_blank_values)
    keyvals.sort()
    if strip_utm_tags:
        keyvals = _strip_tags(keyvals)
    query = urllib.urlencode(keyvals)
    path = _correct_relative_path(path)
    if not path:
        path = '/'
    fragment = '' if not keep_fragments else fragment
    # sometimes utm tags are inside fragment
    if fragment and strip_utm_tags:
        try:
            parsed_fragment = parse_qsl(fragment, keep_blank_values=True, strict_parsing=True)
            fragment = urllib.urlencode(_strip_tags(parsed_fragment))
        except ValueError:
            pass
    return requote_url(urlunparse([scheme, netloc, path, params, query, fragment]))
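A hedged sketch of the default behaviour listed in the docstring; the exact outputs are inferred from those procedures and the code above, not verified against the library:

    # Defaults: query keys sorted, utm_* arguments stripped, fragment dropped,
    # netloc lowercased, and an empty path replaced by '/'.
    print canonicalize_url('http://www.Example.com?b=2&a=1&utm_source=feed#top')
    # http://www.example.com/?a=1&b=2
    print canonicalize_url('http://www.Example.com?b=2&a=1#top', keep_fragments=True)
    # http://www.example.com/?a=1&b=2#top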
Example No. 7
 def _extract_links_from_html(self, html, response_encoding):
     links = []
     for el, attr, link, pos in html.iterlinks():
         if self.tag_func(el.tag):
             if self.attr_func(attr):
                 try:
                     url = requote_url(to_str(to_unicode(link, 'utf-8'), response_encoding))
                     text = el.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (el.attrib.get('rel') == 'nofollow')
                 except Exception as e:
                     log.msg(
                         format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING, url=html.base_url, etype=type(e),
                         error=e)
                 else:
                     links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
Example No. 8
 def url_allowed(self, url):
     url = to_str(url)
     parsed_url = urlparse(url)
     allowed = parsed_url.scheme in ['http', 'https', 'file']
     # filter mobile and pda sites
     if allowed and self.filter_mobile:
         allowed &= not parsed_url.netloc.startswith('m.')
         allowed &= not parsed_url.netloc.startswith('pda.')
     if allowed and self.allow_res:
         allowed &= _matches(url, self.allow_res)
     if allowed and self.deny_res:
         allowed &= not _matches(url, self.deny_res)
     if allowed and self.allow_domains:
         allowed &= is_url_from_any_domain(parsed_url, self.allow_domains)
     if allowed and self.deny_domains:
         allowed &= not is_url_from_any_domain(parsed_url, self.deny_domains)
     if allowed and self.deny_extensions:
         allowed &= not has_url_any_extension(parsed_url, self.deny_extensions)
     return allowed
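A hedged usage sketch of the filter above; the `extractor` object and the contents of its allow/deny settings are assumptions, only the order of the checks comes from the method itself:

    # `extractor` is assumed to have filter_mobile=True, a deny_extensions list
    # containing '.jpg', and the remaining allow/deny settings left empty.
    extractor.url_allowed('http://example.com/index.html')   # True
    extractor.url_allowed('http://m.example.com/')           # False -- mobile subdomain
    extractor.url_allowed('ftp://example.com/file.txt')      # False -- scheme not http/https/file
    extractor.url_allowed('http://example.com/photo.jpg')    # False -- denied extension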
Example No. 9
    def _prepare_url(self, url, params):
        if isinstance(url, basestring):
            url = to_str(url, self._encoding)
        else:
            raise TypeError('Bad type for `url` object: %s' % type(url))

        scheme, netloc, path, _params, query, fragment = urlparse(url)
        if not scheme:
            raise ValueError('Invalid URL %s: No schema supplied.' % url)
        if not netloc and not path:
            raise ValueError('Invalid URL %s: No netloc nor path supplied.' %
                             url)

        # Bare domains aren't valid URLs.
        if not path:
            path = '/'

        enc_params = self._encode_params(params)
        if enc_params:
            if query:
                query = '%s&%s' % (query, enc_params)
            else:
                query = enc_params

        # AJAX escaping
        if fragment.startswith('!'):
            fragment = requote_ajax(fragment[1:])
            if query:
                query = '%s&_escaped_fragment_=%s' % (query, fragment)
            else:
                query = '_escaped_fragment_=%s' % fragment
            fragment = ''

        quoted = requote_url(urlunparse([scheme, netloc, path, _params, query,
                                         fragment]))
        self.parsed_url = urlparse(quoted)
        return quoted
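A hedged sketch of the AJAX branch above; the `request` object is hypothetical and requote_ajax's exact escaping is an assumption, while the _escaped_fragment_ rewrite itself is what the code performs:

    # Hypothetical request object exposing _prepare_url as defined above.
    request._prepare_url('http://example.com/page#!state=1', params={})
    # -> 'http://example.com/page?_escaped_fragment_=state%3D1'
    #    (the '#!' fragment moves into the query string; requote_ajax is assumed
    #     to escape '=' as %3D, following Google's AJAX-crawling convention)
    request._prepare_url('http://example.com', params={'q': 'x'})
    # -> 'http://example.com/?q=x'  (bare domain gets a '/' path, params appended)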
Example No. 10
 def _extract_links_from_html(self, html, response_encoding):
     links = []
     for el, attr, link, pos in html.iterlinks():
         if self.tag_func(el.tag):
             if self.attr_func(attr):
                 try:
                     url = requote_url(
                         to_str(to_unicode(link, 'utf-8'), response_encoding))
                     url = correct_relative_path(url)
                     text = el.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (el.attrib.get('rel') == 'nofollow')
                 except Exception as e:
                     log.msg(
                         format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING, url=html.base_url, etype=type(e),
                         error=e)
                 else:
                     links.append(
                         Link(url=url, text=text, nofollow=nofollow))
     return links
Example No. 11
 def _prepare_body(self, body):
     return to_str(body, self._encoding)
Example No. 12
 def normkey(self, key):
     return to_str(key.title(), self.encoding)
Example No. 13
 def normvalue(self, value):
     if not hasattr(value, '__iter__'):
         value = [value]
     return [to_str(x, self.encoding) for x in value]
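Both normalizers above rely on plain str methods; a standalone illustration (Python 2) of the header-name casing produced by key.title() in normkey():

    # str.title() capitalizes each dash-separated word, which matches the
    # conventional HTTP header casing used by normkey() above.
    print 'content-type'.title()       # Content-Type
    print 'x-forwarded-for'.title()    # X-Forwarded-For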
Example No. 14
 def test_to_str(self):
     self.assertEqual(to_str(u'\xa3 49'), '\xc2\xa3 49')
     self.assertEqual(to_str(u'\xa3 49', 'latin-1'), '\xa3 49')
     self.assertEqual(to_str('lel\xf1e'), 'lel\xf1e')
     self.assertEqual(to_str([10, 11]), '[10, 11]')
     self.assertIn('?', to_str(u'a\ufffdb', 'latin-1', errors='replace'))