Example #1
def _urlencode(seq, enc):
    values = [
        (unicode_to_str(k, enc), unicode_to_str(v, enc))
        for k, vs in seq
        for v in (vs if hasattr(vs, "__iter__") else [vs])
    ]
    return urllib.urlencode(values, doseq=1)
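A quick behavior sketch (hypothetical inputs, not from the original project): iterable values are flattened into repeated query parameters before encoding.

# _urlencode([(u'a', u'1'), (u'b', [u'2', u'3'])], 'utf-8')  ->  'a=1&b=2&b=3'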
Example #2
    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        base_url = xs.select('//base/@href').extract()
        base_url = unicode_to_str(base_url[0]) if base_url else unicode_to_str(response.url)

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret
Example #3
def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8', prepend_level=True):
    """Adapt Twisted log eventDict making it suitable for logging with a Scrapy
    log observer. It may return None to indicate that the event should be
    ignored by a Scrapy log observer.

    `log_level` is the minimum level being logged, and `encoding` is the log
    encoding.
    """
    ev = eventDict.copy()
    if ev['isError']:
        ev.setdefault('logLevel', ERROR)
    # ignore non-error messages from outside scrapy
    if ev.get('system') != 'scrapy' and not ev['isError']:
        return
    level = ev.get('logLevel')
    if level < log_level:
        return
    spider = ev.get('spider')
    if spider:
        ev['system'] = spider.name
    message = ev.get('message')
    lvlname = level_names.get(level, 'NOLEVEL')
    if message:
        message = [unicode_to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
    ev['message'] = message
    why = ev.get('why')
    if why:
        why = unicode_to_str(why, encoding)
        if prepend_level:
            why = "%s: %s" % (lvlname, why)
    ev['why'] = why
    return ev
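A minimal sketch of the adapter's happy path (hypothetical event dict, assuming INFO and level_names are in scope as above):

# _adapt_eventdict({'system': 'scrapy', 'isError': 0, 'logLevel': INFO,
#                   'message': (u'Spider opened',)})
# -> event dict whose 'message' becomes ['INFO: Spider opened']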
Example #4
def urljoin_rfc(base, ref, encoding='utf-8'):
    """Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.
    """
    return urlparse.urljoin(unicode_to_str(base, encoding), \
        unicode_to_str(ref, encoding))
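For illustration, a hypothetical join (following urlparse.urljoin semantics):

# urljoin_rfc('http://example.com/a/', u'b?x=1')  ->  'http://example.com/a/b?x=1'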
Example #5
def _adapt_eventdict(eventDict,
                     log_level=INFO,
                     encoding='utf-8',
                     crawler=None,
                     prepend_level=True):
    """Adapt Twisted log eventDict making it suitable for logging with a Scrapy
    log observer. It may return None to indicate that the event should be
    ignored by a Scrapy log observer.

    `log_level` is the minimum level being logged, and `encoding` is the log
    encoding.
    """
    ev = eventDict.copy()
    if ev['isError']:
        ev.setdefault('logLevel', ERROR)

    # ignore non-error messages from outside scrapy
    if ev.get('system') != 'scrapy' and not ev['isError']:
        return

    level = ev.get('logLevel')
    if level < log_level:
        return

    spider = ev.get('spider')
    if spider:
        ev['system'] = unicode_to_str(spider.name, encoding)
    if crawler and (not spider or spider.crawler is not crawler):
        # ignore events not triggered by own spiders in crawlers' observers
        return
    if not crawler and spider:
        # ignore spiders' events in observers without crawler
        return

    lvlname = level_names.get(level, 'NOLEVEL')
    message = ev.get('message')
    if message:
        message = [unicode_to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
        ev['message'] = message

    why = ev.get('why')
    if why:
        why = unicode_to_str(why, encoding)
        if prepend_level:
            why = "%s: %s" % (lvlname, why)
        ev['why'] = why

    fmt = ev.get('format')
    if fmt:
        fmt = unicode_to_str(fmt, encoding)
        if prepend_level:
            fmt = "%s: %s" % (lvlname, fmt)
        ev['format'] = fmt

    return ev
Example #7
    def test_unicode_to_str(self):
        # converting a unicode object to a UTF-8 encoded string
        self.assertEqual(unicode_to_str(u'\xa3 49'), '\xc2\xa3 49')

        # converting a unicode object to a latin-1 encoded string
        self.assertEqual(unicode_to_str(u'\xa3 49', 'latin-1'), '\xa3 49')

        # converting a regular string to string should return the same object
        self.assertEqual(unicode_to_str('lel\xf1e'), 'lel\xf1e')

        # converting a strange object should raise TypeError
        self.assertRaises(TypeError, unicode_to_str, unittest)

        # check errors argument works
        assert '?' in unicode_to_str(u'a\ufffdb', 'latin-1', errors='replace')
Example #9
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
        encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces in query arguments to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is always a
    str.

    For examples see the tests in scrapy.tests.test_utils_url
    """

    url = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(urllib.unquote(path))
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse(
        (scheme, netloc.lower(), path, params, query, fragment))
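A hedged illustration of those steps (hypothetical URL; the path component also passes through safe_url_string):

# canonicalize_url('http://Example.COM/do?b=2&a=1#frag')
# -> 'http://example.com/do?a=1&b=2'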
Example #10
def get_log_item(ev, min_level=log.INFO):
    """Get HubStorage log item for the given Twisted event, or None if no
    document should be inserted
    """
    if ev['system'] == 'scrapy':
        level = ev['logLevel']
    else:
        if ev['isError']:
            level = log.ERROR
        else:
            return # ignore non-scrapy & non-error messages
    if level < min_level:
        return
    msg = ev.get('message')
    if msg:
        msg = unicode_to_str(msg[0])
    failure = ev.get('failure', None)
    if failure:
        msg = failure.getTraceback()
    why = ev.get('why', None)
    if why:
        msg = "%s\n%s" % (why, msg)
    fmt = ev.get('format')
    if fmt:
        try:
            msg = fmt % ev
        except:
            msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
            level = log.ERROR
    msg = msg.replace('\n', '\n\t') # to replicate typical scrapy log appearance
    return {'message': msg, 'level': level, 'time': int(time.time()*1000)}
Example #11
def _umock(result=None, error=None):
    response = {}
    if result is not None:
        response.update(result=result)
    if error is not None:
        response.update(error=error)
    return BytesIO(unicode_to_str(json.dumps(response)))
Example #13
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    return url if isinstance(url, ParseResult) else \
        urlparse(unicode_to_str(url, encoding))
Example #14
def get_uid(url):
    """
        get the uid of the url
        algorithm:
        1) get 16 bytes (128 bits) md5, encoded by hex
        2) split the first 8 bytes and the last 8 bytes
        3) convert the two 8 bytes into int
        4) XOR the two 8 bytes
        5) encode the result by hex
    """
    # convert unicode to str (with encode utf-8)
    # this function is str safe, without double encode error
    url = unicode_to_str(url)
    if isinstance(url, types.StringType):
        # md5 is a 32-character hex string (16 bytes)
        md5 = hashlib.new("md5", url).hexdigest()
        first_half_bytes = md5[:16]
        last_half_bytes = md5[16:]

        # get the two long int
        first_half_int = int(first_half_bytes, 16)
        last_half_int = int(last_half_bytes, 16)

        # XOR the two long int, get a long int
        xor_int = first_half_int ^ last_half_int

        # convert to a hex string
        uid = "%x" % xor_int

        return uid
    else:
        raise Exception('cannot sign a non-string object: %s' % type(url))
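A minimal standalone sketch of the same 64-bit fold (Python 2; the helper name fold_md5 is illustrative, not from the original project):

import hashlib

def fold_md5(s):
    digest = hashlib.md5(s).hexdigest()    # 32 hex chars = 128 bits
    hi, lo = int(digest[:16], 16), int(digest[16:], 16)
    return "%x" % (hi ^ lo)                # XOR-folded 64-bit uid, hex-encoded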
Example #16
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
        encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces in query arguments to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is always a
    str.

    For examples see the tests in scrapy.tests.test_utils_url
    """

    url = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = urllib.quote(urllib.unquote(path))
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
Example #17
def _adapt_eventdict(eventDict, log_level=INFO, encoding="utf-8", prepend_level=True):
    """Adapt Twisted log eventDict making it suitable for logging with a Scrapy
    log observer. It may return None to indicate that the event should be
    ignored by a Scrapy log observer.

    `log_level` is the minimum level being logged, and `encoding` is the log
    encoding.
    """
    ev = eventDict.copy()
    if ev["isError"]:
        ev.setdefault("logLevel", ERROR)

    # ignore non-error messages from outside scrapy
    if ev.get("system") != "scrapy" and not ev["isError"]:
        return

    level = ev.get("logLevel")
    if level < log_level:
        return

    spider = ev.get("spider")
    if spider:
        ev["system"] = spider.name

    lvlname = level_names.get(level, "NOLEVEL")
    message = ev.get("message")
    if message:
        message = [unicode_to_str(x, encoding) for x in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
        ev["message"] = message

    why = ev.get("why")
    if why:
        why = unicode_to_str(why, encoding)
        if prepend_level:
            why = "%s: %s" % (lvlname, why)
        ev["why"] = why

    fmt = ev.get("format")
    if fmt:
        fmt = unicode_to_str(fmt, encoding)
        if prepend_level:
            fmt = "%s: %s" % (lvlname, fmt)
        ev["format"] = fmt

    return ev
Example #18
    def __init__(self, response=None, text=None, root=None, expr=None, namespaces=None):
        if text:
            self.response = TextResponse(url='about:blank',
                body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
        else:
            self.response = response
        self._root = root
        self._xpathev = None
        self.namespaces = namespaces
        self.expr = expr
Example #19
    def __init__(self, *args, **kwargs):
        formdata = kwargs.pop("formdata", None)
        super(FormRequest, self).__init__(*args, **kwargs)

        if formdata:
            items = formdata.iteritems() if isinstance(formdata, dict) else formdata
            query = [(unicode_to_str(k, self.encoding), _unicode_to_str(v, self.encoding)) for k, v in items]
            self.method = "POST"
            self._set_body(urllib.urlencode(query, doseq=1))
            self.headers["Content-Type"] = "application/x-www-form-urlencoded"
Example #20
    def parse_article(self, response):
        content = unicode_to_str(response.body_as_unicode(), 'latin-1', 'ignore')
        for block in [content[i:i+1000] for i in range(0, len(content), 1000)]:
            # Build the data structure that RNDADDENTROPY requires
            format = 'ii%is' % len(block)
            entropy_data = struct.pack(format, 8 * len(block), len(block), block)
            # Call the RNDADDENTROPY ioctl
            random_dev_fd = os.open('/dev/random', os.O_WRONLY)
            ioctl(random_dev_fd, RNDADDENTROPY, entropy_data)
            os.close(random_dev_fd)
Example #21
def get_crawl_args(message):
    """Return the command-line arguments to use for the scrapy crawl process
    that will be started for this message
    """
    msg = message.copy()
    args = [unicode_to_str(msg['_spider'])]
    del msg['_project'], msg['_spider']
    for k, v in stringify_dict(msg, keys_only=False).items():
        args += ['-a']
        args += ['%s=%s' % (k, v)]
    return args
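For illustration, a hypothetical message and the resulting argument list:

# get_crawl_args({'_project': 'myproject', '_spider': 'myspider', 'arg1': u'val1'})
# -> ['myspider', '-a', 'arg1=val1']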
Example #22
    def __init__(self, response=None, text=None, namespaces=None, _root=None, _expr=None):
        if text is not None:
            response = TextResponse(url='about:blank', \
                body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
        if response is not None:
            _root = LxmlDocument(response, self._parser)

        self.namespaces = namespaces
        self.response = response
        self._root = _root
        self._expr = _expr
Example #24
    def __init__(self, *args, **kwargs):
        formdata = kwargs.pop('formdata', None)
        super(FormRequest, self).__init__(*args, **kwargs)

        if formdata:
            items = formdata.iteritems() if isinstance(formdata, dict) else formdata
            query = [(unicode_to_str(k, self.encoding), _unicode_to_str(v, self.encoding))
                    for k, v in items]
            self.method = 'POST'
            self.body = urllib.urlencode(query, doseq=1)
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'
Example #27
def get_crawl_args_dict(message):
    """Return arguments dictionary to use for output"""
    argsDict = {}
    msg = message.copy()
    args = [unicode_to_str(msg['_spider'])]
    del msg['_project'], msg['_spider']
    settings = msg.pop('settings', {})
    for k, v in stringify_dict(msg, keys_only=False).items():
        argsDict[k] = v
    for k, v in stringify_dict(settings, keys_only=False).items():
        argsDict[k] = v
    return argsDict
Example #28
def add_sample(source):
    """
    Method for adding samples to test samples file
    (use from console)
    """
    count = 0
    while os.path.exists("%s_%d.json" % (SAMPLES_FILE_PREFIX, count)):
        count += 1
    
    open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "wb").write(unicode_to_str(source))
    parsed = list(parse_html(source))
    open("%s_%d.json" % (SAMPLES_FILE_PREFIX, count), "wb")\
        .write(json.dumps(parsed, default=_encode_element, indent=8))
Example #29
    def __init__(self, response=None, text=None, node=None, parent=None, expr=None):
        if parent is not None:
            self.doc = parent.doc
            self.xmlNode = node
        elif response:
            self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
            self.xmlNode = self.doc.xmlDoc
        elif text:
            response = TextResponse(url='about:blank',
                body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
            self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
            self.xmlNode = self.doc.xmlDoc
        self.expr = expr
Example #30
def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
    """Log message according to the level"""
    if level > log_level:
        return
    if domain is not None:
        import warnings
        warnings.warn("'domain' argument of scrapy.log.msg() is deprecated, " \
            "use 'spider' argument instead", DeprecationWarning, stacklevel=2)
    dispatcher.send(signal=logmessage_received, message=message, level=level, \
        spider=spider)
    system = domain or (spider.domain_name if spider else component)
    msg_txt = unicode_to_str("%s: %s" % (level_names[level], message))
    log.msg(msg_txt, system=system)
Example #31
def err(_stuff=None, _why=None, **kwargs):
    if ERROR > log_level:
        return
    domain = kwargs.pop('domain', None)
    spider = kwargs.pop('spider', None)
    component = kwargs.pop('component', BOT_NAME)
    if domain is not None:
        import warnings
        warnings.warn("'domain' argument of scrapy.log.err() is deprecated, " \
            "use 'spider' argument instead", DeprecationWarning, stacklevel=2)
    kwargs['system'] = domain or (spider.domain_name if spider else component)
    if _why:
        _why = unicode_to_str("ERROR: %s" % _why)
    log.err(_stuff, _why, **kwargs)
Example #34
def safe_url_string(url, encoding='utf8'):
    """Convert the given url into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a unicode url is given, it is first converted to str using the given
    encoding (which defaults to 'utf-8'). When passing an encoding, you should
    use the encoding of the original page (the page from which the url was
    extracted).

    Calling this function on an already "safe" url will return the url
    unmodified.

    Always returns a str.
    """
    s = unicode_to_str(url, encoding)
    return urllib.quote(s, _safe_chars)
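A hedged example (assuming _safe_chars covers the RFC-3986 reserved set, as in Scrapy):

# safe_url_string(u'http://www.example.com/\xa3')  ->  'http://www.example.com/%C2%A3'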
Example #35
    def select(self, xpath):
        if hasattr(self.xmlNode, 'xpathEval'):
            self.doc.xpathContext.setContextNode(self.xmlNode)
            xpath = unicode_to_str(xpath, 'utf-8')
            try:
                xpath_result = self.doc.xpathContext.xpathEval(xpath)
            except libxml2.xpathError:
                raise ValueError("Invalid XPath: %s" % xpath)
            if hasattr(xpath_result, '__iter__'):
                return XPathSelectorList([self.__class__(node=node, parent=self,
                    expr=xpath) for node in xpath_result])
            else:
                return XPathSelectorList([self.__class__(node=xpath_result,
                    parent=self, expr=xpath)])
        else:
            return XPathSelectorList([])
Example #37
def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    if args and kwargs:
        raise ValueError("Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
    data = unicode_to_str(json.dumps(req))
    body = urllib.request.urlopen(url, data).read()
    res = json.loads(body.decode('utf-8'))
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)
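A hypothetical invocation (the URL and method name are illustrative only):

# result = jsonrpc_client_call('http://localhost:6023/jsonrpc', 'listprojects')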
Example #39
    def __init__(self, response=None, text=None, node=None, parent=None, expr=None,
                 use_html5lib=False, use_BeautifulSoup=False, namespaces=None):
        if parent:
            self.doc = parent.doc
            self.xmlNode = node
        elif response:
            self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
                                                     use_BeautifulSoup)
            self.doc = self.xmlNode.getroottree()
        elif text:
            response = TextResponse(url='about:blank', body=unicode_to_str(text),
                                    encoding='utf-8')
            self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
                                                     use_BeautifulSoup)
            self.doc = self.xmlNode.getroottree()
        self.expr = expr
        self.namespaces = namespaces or {}
Example #41
    def assertObjectMatch(self, expected, actual, msg=None, keys=None):  # {{{
        """If key starts with r:, do regex match, else do equal test"""
        actual_is_dict = hasattr(actual, "__getitem__")

        def my_type(o):
            return str(type(o))[7:-2]

        def parse_key(key):
            how = None
            parts = key.split(":")
            if len(parts) == 2:
                how, key = parts[0], parts[1]
            return (how, key)

        def get_value(obj, key):
            if actual_is_dict:
                return obj.get(key, None)
            else:
                return getattr(obj, key, None)

        def check_match(expected, actual, how):
            if how == "r":  # regex match
                if not expected or not actual:
                    return expected == actual
                else:
                    return re.search(expected, actual) is not None
            else:
                return expected == actual

        keys = keys or expected.keys()
        err_lines = []
        for key in keys:
            how, actual_key = parse_key(key)
            ev = expected[key]
            av = get_value(actual, actual_key)
            if not check_match(ev, av, how):
                errmsg = "%s: %s %s != %s %s" % (key, my_type(ev), ev, my_type(av), av)
                err_lines.append(errmsg)
        # end for

        errmsg = "\n".join(err_lines)
        if msg:
            errmsg = msg + "\n" + errmsg
        self.failIf(err_lines, unicode_to_str(errmsg))
Example #42
    def _get_log_item(self, ev):
        """Get HubStorage log item for the given Twisted event, or None if no
        document should be inserted
        """
        if ev['system'] == 'scrapy':
            level = ev['logLevel']
        else:
            if ev['isError']:
                level = logging.ERROR
            else:
                level = logging.INFO

        # It's important to access the level through the handler instance;
        # the min log level can change at any moment.
        if level < self._hs_loghdlr.level:
            return

        msg = ev.get('message')
        if msg:
            msg = unicode_to_str(msg[0])

        failure = ev.get('failure', None)
        if failure:
            msg = failure.getTraceback()

        why = ev.get('why', None)
        if why:
            msg = "%s\n%s" % (why, msg)

        fmt = ev.get('format')
        if fmt:
            try:
                msg = fmt % ev
            except:
                msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
                level = logging.ERROR

        msg = msg.replace(
            '\n', '\n\t')  # to replicate typical scrapy log appearance
        return {'message': msg, 'level': level}
Example #44
    def _add_link(url_sel, alt_sel=None):
        url = flatten([url_sel.extract()])
        alt = flatten([alt_sel.extract()]) if alt_sel else (u'',)
        if url:
            ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))
Example #45
def to_scrapy_response(url, body):
    return TextResponse(url=url,
                        body=unicode_to_str(body, 'utf-8'),
                        encoding='utf-8')
Example #46
def _unicode_to_str(string, encoding):
    if hasattr(string, '__iter__'):
        return [unicode_to_str(k, encoding) for k in string]
    else:
        return unicode_to_str(string, encoding)
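Behavior sketch (hypothetical inputs): scalars pass through unchanged, iterables are converted element-wise.

# _unicode_to_str(u'a', 'utf-8')          ->  'a'
# _unicode_to_str([u'a', u'b'], 'utf-8')  ->  ['a', 'b']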
Example #47
def u_to_str(text):
    return unicode_to_str(text, 'latin-1', 'ignore')
Example #49
def unicode_to_gbk(src):
    return unicode_to_str(src, 'gbk', errors='ignore')
Example #50
def _response_from_text(text, st):
    rt = XmlResponse if st == 'xml' else HtmlResponse
    return rt(url='about:blank', encoding='utf-8',
              body=unicode_to_str(text, 'utf-8'))
Example #54
    def _unicode_to_str(self, eventDict):
        message = eventDict.get('message')
        if message:
            eventDict['message'] = tuple(
                unicode_to_str(x, self.encoding) for x in message)
        return eventDict