Python unicode_to_str примеры, scrapy.utils.python.unicode_to_str Python примеры использования

Пример #1

0

Показать файл

Файл: form.py Проект: myli-cn/scrapy

def _urlencode(seq, enc):
    """Build a URL-encoded query string from ``seq`` using encoding ``enc``.

    ``seq`` is an iterable of ``(key, value)`` pairs. A value that exposes
    ``__iter__`` is expanded into one pair per element; any other value is
    treated as a single value. Keys and values are passed through
    ``unicode_to_str`` before being handed to ``urllib.urlencode``.
    """
    pairs = []
    for key, raw in seq:
        # Wrap scalars so both shapes share the same inner loop.
        items = raw if hasattr(raw, "__iter__") else [raw]
        for item in items:
            pairs.append((unicode_to_str(key, enc), unicode_to_str(item, enc)))
    return urllib.urlencode(pairs, doseq=1)

Пример #2

0

Показать файл

Файл: image.py Проект: serkanh/scrapy

    def extract_links(self, response):
        """Return the links found at ``self.locations`` inside ``response``.

        The base URL is the first ``//base/@href`` value of the document, or
        ``response.url`` when the document has none. Each location may be an
        XPath string, an ``HtmlXPathSelector`` or an ``XPathSelectorList``;
        anything else is skipped. Link URLs are joined against the base URL,
        optionally de-duplicated (``self.unique``) and optionally
        canonicalized (``self.canonicalize``).
        """
        xs = HtmlXPathSelector(response)
        base_hrefs = xs.select('//base/@href').extract()
        if base_hrefs:
            base_url = unicode_to_str(base_hrefs[0])
        else:
            base_url = unicode_to_str(response.url)

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, HtmlXPathSelector):
                selectors = [location]
            elif isinstance(location, XPathSelectorList):
                selectors = location
            else:
                continue
            for selector in selectors:
                links.extend(self.extract_from_selector(selector))

        seen = set()
        ret = []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                # Uniqueness is decided on the joined URL, i.e. before the
                # optional canonicalization below.
                if link.url in seen:
                    continue
                seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)
        return ret

Пример #3

0

Показать файл

Файл: log.py Проект: dreamfrog/jophiel

def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8', prepend_level=True):
    """Return a copy of a Twisted log ``eventDict`` adapted for a Scrapy log
    observer, or None when the event should be ignored.

    `log_level` is the minimum level being logged and `encoding` is the log
    encoding. When `prepend_level` is true the level name is prefixed to the
    first message part and to the ``why`` text.
    """
    ev = eventDict.copy()
    is_error = ev['isError']
    if is_error:
        ev.setdefault('logLevel', ERROR)
    # non-error events that do not come from scrapy are dropped
    if not is_error and ev.get('system') != 'scrapy':
        return
    level = ev.get('logLevel')
    if level < log_level:
        return
    spider = ev.get('spider')
    if spider:
        ev['system'] = spider.name
    lvlname = level_names.get(level, 'NOLEVEL')

    message = ev.get('message')
    if message:
        message = [unicode_to_str(part, encoding) for part in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
    # the key is written back even when it was empty or missing
    ev['message'] = message

    why = ev.get('why')
    if why:
        why = unicode_to_str(why, encoding)
        if prepend_level:
            why = "%s: %s" % (lvlname, why)
    ev['why'] = why
    return ev

Пример #4

0

Показать файл

def urljoin_rfc(base, ref, encoding='utf-8'):
    """Join ``ref`` onto ``base`` like urlparse.urljoin, accepting unicode
    values for either argument.

    Both arguments are passed through ``unicode_to_str`` with the given
    encoding before joining, so the result is always a str.
    """
    base_str = unicode_to_str(base, encoding)
    ref_str = unicode_to_str(ref, encoding)
    return urlparse.urljoin(base_str, ref_str)

Пример #5

0

Показать файл

Файл: log.py Проект: zxsted/scrapy

def _adapt_eventdict(eventDict,
                     log_level=INFO,
                     encoding='utf-8',
                     crawler=None,
                     prepend_level=True):
    """Return a copy of a Twisted log ``eventDict`` adapted for a Scrapy log
    observer, or None when the event should be ignored.

    `log_level` is the minimum level being logged and `encoding` is the log
    encoding. When `crawler` is given, only events whose spider belongs to
    that crawler are kept; without a crawler, events carrying a spider are
    dropped. When `prepend_level` is true the level name is prefixed to the
    first message part, the ``why`` text and the ``format`` string.
    """
    ev = eventDict.copy()
    is_error = ev['isError']
    if is_error:
        ev.setdefault('logLevel', ERROR)

    # non-error events that do not come from scrapy are dropped
    if not is_error and ev.get('system') != 'scrapy':
        return

    level = ev.get('logLevel')
    if level < log_level:
        return

    spider = ev.get('spider')
    if spider:
        ev['system'] = unicode_to_str(spider.name, encoding)
    if crawler:
        # crawler observers only accept events from their own spiders
        if not spider or spider.crawler is not crawler:
            return
    elif spider:
        # observers without a crawler ignore spider events
        return

    lvlname = level_names.get(level, 'NOLEVEL')
    message = ev.get('message')
    if message:
        message = [unicode_to_str(part, encoding) for part in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
        ev['message'] = message

    # 'why' and 'format' receive exactly the same treatment
    for key in ('why', 'format'):
        text = ev.get(key)
        if text:
            text = unicode_to_str(text, encoding)
            if prepend_level:
                text = "%s: %s" % (lvlname, text)
            ev[key] = text

    return ev

Пример #6

0

Показать файл

Файл: url.py Проект: kenzouyeh/scrapy

def urljoin_rfc(base, ref, encoding='utf-8'):
    """Join ``ref`` onto ``base`` like urlparse.urljoin, accepting unicode
    values for either argument.

    Both arguments are passed through ``unicode_to_str`` with the given
    encoding before joining, so the result is always a str.
    """
    base_str = unicode_to_str(base, encoding)
    ref_str = unicode_to_str(ref, encoding)
    return urlparse.urljoin(base_str, ref_str)

Пример #7

0

Показать файл

Файл: test_utils_python.py Проект: Aaron1011/oh-mainline

    def test_unicode_to_str(self):
        """Check unicode_to_str with unicode, str and unsupported inputs."""
        pound = u'\xa3 49'

        # unicode input is encoded as utf-8 when no encoding is given
        self.assertEqual(unicode_to_str(pound), '\xc2\xa3 49')

        # an explicit encoding is honoured
        self.assertEqual(unicode_to_str(pound, 'latin-1'), '\xa3 49')

        # a regular string comes back with equal contents
        plain = 'lel\xf1e'
        self.assertEqual(unicode_to_str(plain), plain)

        # objects that are neither unicode nor str are rejected
        self.assertRaises(TypeError, unicode_to_str, unittest)

        # the errors argument is forwarded to the encoder
        replaced = unicode_to_str(u'a\ufffdb', 'latin-1', errors='replace')
        assert '?' in replaced

Пример #8

0

Показать файл

    def test_unicode_to_str(self):
        """Check unicode_to_str with unicode, str and unsupported inputs."""
        pound = u'\xa3 49'

        # unicode input is encoded as utf-8 when no encoding is given
        self.assertEqual(unicode_to_str(pound), '\xc2\xa3 49')

        # an explicit encoding is honoured
        self.assertEqual(unicode_to_str(pound, 'latin-1'), '\xa3 49')

        # a regular string comes back with equal contents
        plain = 'lel\xf1e'
        self.assertEqual(unicode_to_str(plain), plain)

        # objects that are neither unicode nor str are rejected
        self.assertRaises(TypeError, unicode_to_str, unittest)

        # the errors argument is forwarded to the encoder
        replaced = unicode_to_str(u'a\ufffdb', 'latin-1', errors='replace')
        assert '?' in replaced

Пример #9

0

Показать файл

def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
        encoding=None):
    """Return a canonical form of ``url``.

    Steps performed:

    - the url is passed through ``unicode_to_str`` (str or unicode accepted)
    - query arguments are parsed, sorted by key and then by value, and
      re-encoded with ``urllib.urlencode``; ``keep_blank_values`` is
      forwarded to ``cgi.parse_qsl``
    - the path is unquoted and re-escaped with ``safe_url_string``
    - the network location is lower-cased
    - the fragment is dropped unless ``keep_fragments`` is true

    The returned url is always a str.

    For examples see the tests in scrapy.tests.test_utils_url
    """
    raw = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(raw)
    sorted_args = sorted(cgi.parse_qsl(query, keep_blank_values))
    new_query = urllib.urlencode(sorted_args)
    new_path = safe_url_string(urllib.unquote(path))
    if not keep_fragments:
        fragment = ''
    return urlparse.urlunparse(
        (scheme, netloc.lower(), new_path, params, new_query, fragment))

Пример #10

0

Показать файл

Файл: scrapylog.py Проект: alepharchives/python-hubstorage

def get_log_item(ev, min_level=log.INFO):
    """Get HubStorage log item for the given Twisted event, or None if no
    document should be inserted.

    Events from the 'scrapy' system use their own ``logLevel``; events from
    other systems are only kept when they are errors (logged as ERROR).
    Events below ``min_level`` are dropped. The message text is taken, in
    increasing priority, from the first ``message`` part, the failure
    traceback, and the ``format`` string interpolated with the event.
    """
    if ev['system'] == 'scrapy':
        level = ev['logLevel']
    elif ev['isError']:
        level = log.ERROR
    else:
        return  # ignore non-scrapy & non-error messages
    if level < min_level:
        return

    parts = ev.get('message')
    # Always start from a real string: an event with an empty/missing message
    # and no failure/format used to reach msg.replace() with a tuple or None.
    msg = unicode_to_str(parts[0]) if parts else ''
    failure = ev.get('failure', None)
    if failure:
        msg = failure.getTraceback()
    why = ev.get('why', None)
    if why:
        msg = "%s\n%s" % (why, msg)
    fmt = ev.get('format')
    if fmt:
        try:
            msg = fmt % ev
        except Exception:
            # A broken format string must not break logging; report it
            # instead. KeyboardInterrupt/SystemExit are no longer swallowed.
            msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
            level = log.ERROR
    msg = msg.replace('\n', '\n\t')  # to replicate typical scrapy log appearance
    return {'message': msg, 'level': level, 'time': int(time.time()*1000)}

Пример #11

0

Показать файл

Файл: test_jsonrpc.py Проект: AllenCHM/scrapy-jsonrpc

def _umock(result=None, error=None):
    """Return a BytesIO holding a JSON object built from the given members.

    Only the members that are not None ('result' and/or 'error') are
    included in the serialized object.
    """
    payload = {}
    for key, value in (('result', result), ('error', error)):
        if value is not None:
            payload[key] = value
    return BytesIO(unicode_to_str(json.dumps(payload)))

Пример #12

0

Показать файл

def _umock(result=None, error=None):
    """Return a BytesIO holding a JSON object built from the given members.

    Only the members that are not None ('result' and/or 'error') are
    included in the serialized object.
    """
    payload = {}
    for key, value in (('result', result), ('error', error)):
        if value is not None:
            payload[key] = value
    return BytesIO(unicode_to_str(json.dumps(payload)))

Пример #13

0

Показать файл

Файл: url.py Проект: Terrenceyang213/SourceLearningNote-Scrapy-

def parse_url(url, encoding=None):
    """Return the urlparse result for ``url``.

    A value that is already a ``ParseResult`` is returned untouched;
    anything else is passed through ``unicode_to_str`` with ``encoding``
    and then parsed.
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(unicode_to_str(url, encoding))

Пример #14

0

Показать файл

Файл: url.py Проект: falood/smart-buyer

def get_uid(url):
    """
        Return a hex uid derived from the md5 of the url.

        algorithm:
        1) take the md5 of the url as 32 hex digits (128 bits)
        2) split it into the first and the last 16 hex digits (8 bytes each)
        3) convert both halves to integers
        4) XOR the two integers
        5) format the result as hex (not zero-padded, so it can be shorter
           than 16 digits)

        Raises Exception when the converted url is not a str.
    """
    # unicode becomes a str here; a str goes through unchanged
    url = unicode_to_str(url)
    if not isinstance(url, types.StringType):
        raise Exception('cannot sign a no-string object:%s' % type(url))

    digest = hashlib.new("md5", url).hexdigest()
    high_half = int(digest[:16], 16)
    low_half = int(digest[16:], 16)
    return "%x" % (high_half ^ low_half)

Пример #15

0

Показать файл

Файл: url.py Проект: cash2one/smart-buyer

def get_uid(url):
    """
        Return a hex uid derived from the md5 of the url.

        algorithm:
        1) take the md5 of the url as 32 hex digits (128 bits)
        2) split it into the first and the last 16 hex digits (8 bytes each)
        3) convert both halves to integers
        4) XOR the two integers
        5) format the result as hex (not zero-padded, so it can be shorter
           than 16 digits)

        Returns None when the converted url is not a str.
    """
    # unicode becomes a str here; a str goes through unchanged
    url = unicode_to_str(url)
    if not isinstance(url, types.StringType):
        return None

    digest = hashlib.new("md5", url).hexdigest()
    high_half = int(digest[:16], 16)
    low_half = int(digest[16:], 16)
    return "%x" % (high_half ^ low_half)

Пример #16

0

Показать файл

Файл: url.py Проект: kenzouyeh/scrapy

def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
        encoding=None):
    """Return a canonical form of ``url``.

    Steps performed:

    - the url is passed through ``unicode_to_str`` (str or unicode accepted)
    - query arguments are parsed, sorted by key and then by value, and
      re-encoded with ``urllib.urlencode``; ``keep_blank_values`` is
      forwarded to ``cgi.parse_qsl``
    - the path is unquoted and quoted again with ``urllib.quote``
    - the fragment is dropped unless ``keep_fragments`` is true

    The returned url is always a str.

    For examples see the tests in scrapy.tests.test_utils_url
    """
    raw = unicode_to_str(url, encoding)
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(raw)
    sorted_args = sorted(cgi.parse_qsl(query, keep_blank_values))
    new_query = urllib.urlencode(sorted_args)
    new_path = urllib.quote(urllib.unquote(path))
    if not keep_fragments:
        fragment = ''
    return urlparse.urlunparse(
        (scheme, netloc, new_path, params, new_query, fragment))

Пример #17

0

Показать файл

Файл: log.py Проект: rahul-c1/scrapy

def _adapt_eventdict(eventDict, log_level=INFO, encoding="utf-8", prepend_level=True):
    """Return a copy of a Twisted log ``eventDict`` adapted for a Scrapy log
    observer, or None when the event should be ignored.

    `log_level` is the minimum level being logged and `encoding` is the log
    encoding. When `prepend_level` is true the level name is prefixed to the
    first message part, the ``why`` text and the ``format`` string.
    """
    ev = eventDict.copy()
    is_error = ev["isError"]
    if is_error:
        ev.setdefault("logLevel", ERROR)

    # non-error events that do not come from scrapy are dropped
    if not is_error and ev.get("system") != "scrapy":
        return

    level = ev.get("logLevel")
    if level < log_level:
        return

    spider = ev.get("spider")
    if spider:
        ev["system"] = spider.name

    lvlname = level_names.get(level, "NOLEVEL")
    message = ev.get("message")
    if message:
        message = [unicode_to_str(part, encoding) for part in message]
        if prepend_level:
            message[0] = "%s: %s" % (lvlname, message[0])
        ev["message"] = message

    # "why" and "format" receive exactly the same treatment
    for key in ("why", "format"):
        text = ev.get(key)
        if text:
            text = unicode_to_str(text, encoding)
            if prepend_level:
                text = "%s: %s" % (lvlname, text)
            ev[key] = text

    return ev

Пример #18

0

Показать файл

Файл: lxmlsel.py Проект: Aaron1011/oh-mainline

 def __init__(self, response=None, text=None, root=None, expr=None, namespaces=None):
     """Set up the selector from a response or from raw text.

     A non-empty ``text`` takes precedence: it is wrapped in a utf-8
     ``TextResponse`` with url 'about:blank'. Otherwise ``response`` is
     stored as given.
     """
     if text:
         body = unicode_to_str(text, 'utf-8')
         response = TextResponse(url='about:blank', body=body, encoding='utf-8')
     self.response = response
     self._root = root
     self._xpathev = None
     self.namespaces = namespaces
     self.expr = expr

Пример #19

0

Показать файл

Файл: form.py Проект: happyboy310/book-crawler

    def __init__(self, *args, **kwargs):
        """Create the request; when ``formdata`` is given, turn it into a
        url-encoded POST.

        ``formdata`` may be a dict or an iterable of ``(key, value)`` pairs.
        It is removed from ``kwargs`` before the parent constructor runs.
        """
        formdata = kwargs.pop("formdata", None)
        super(FormRequest, self).__init__(*args, **kwargs)
        if not formdata:
            return

        if isinstance(formdata, dict):
            items = formdata.iteritems()
        else:
            items = formdata
        query = []
        for key, value in items:
            query.append((unicode_to_str(key, self.encoding),
                          _unicode_to_str(value, self.encoding)))
        self.method = "POST"
        self._set_body(urllib.urlencode(query, doseq=1))
        self.headers["Content-Type"] = "application/x-www-form-urlencoded"

Пример #20

0

Показать файл

Файл: truenet.py Проект: gaddman/trueentropy

 def parse_article(self, response):
   """Feed the response body to the kernel entropy pool in 1000-byte blocks.

   The body is converted to a latin-1 str (unencodable characters are
   dropped) and each block is submitted through the RNDADDENTROPY ioctl
   on /dev/random. Nothing is returned.
   """
   content = unicode_to_str(response.body_as_unicode(),'latin-1','ignore')
   if not content:
     # Nothing to submit; do not touch the device at all.
     return
   # Open the device once and always release it, even if an ioctl fails.
   random_dev_fd = os.open('/dev/random', os.O_WRONLY)
   try:
     for start in range(0, len(content), 1000):
       block = content[start:start+1000]
       # Build the data structure that RNDADDENTROPY requires: two ints
       # followed by the raw bytes. The first int is 8 * len(block)
       # (presumably the entropy credit in bits - confirm against
       # random(4)), the second is the buffer size in bytes.
       layout = 'ii%is' % len(block)
       entropy_data = struct.pack(layout, 8 * len(block), len(block), block)
       # Call the RNDADDENTROPY ioctl
       ioctl(random_dev_fd, RNDADDENTROPY, entropy_data)
   finally:
     os.close(random_dev_fd)

Пример #21

0

Показать файл

Файл: utils.py Проект: bihicheng/scrapy

def get_crawl_args(message):
    """Return the command-line arguments to use for the scrapy crawl process
    that will be started for this message.

    The list starts with the spider name (``message['_spider']``) followed
    by an ``-a key=value`` pair for every remaining entry except
    ``_project``. ``message`` itself is not modified.
    """
    msg = message.copy()
    args = [unicode_to_str(msg['_spider'])]
    del msg['_project']
    del msg['_spider']
    for key, value in stringify_dict(msg, keys_only=False).items():
        args.extend(['-a', '%s=%s' % (key, value)])
    return args

Пример #22

0

Показать файл

    def __init__(self, response=None, text=None, namespaces=None, _root=None, _expr=None):
        """Set up the selector from a response or from raw text.

        When ``text`` is given (not None) it replaces ``response`` with a
        utf-8 ``TextResponse`` at 'about:blank'. Whenever a response is
        available, the root is built from it with ``LxmlDocument``;
        otherwise the ``_root`` argument is kept.
        """
        if text is not None:
            body = unicode_to_str(text, 'utf-8')
            response = TextResponse(url='about:blank', body=body, encoding='utf-8')
        if response is not None:
            _root = LxmlDocument(response, self._parser)

        self.namespaces = namespaces
        self.response = response
        self._root = _root
        self._expr = _expr

Пример #23

0

Показать файл

Файл: utils.py Проект: richard-ma/CodeReading

def get_crawl_args(message):
    """Return the command-line arguments to use for the scrapy crawl process
    that will be started for this message.

    The list starts with the spider name (``message['_spider']``) followed
    by an ``-a key=value`` pair for every remaining entry except
    ``_project``. ``message`` itself is not modified.
    """
    msg = message.copy()
    args = [unicode_to_str(msg['_spider'])]
    del msg['_project']
    del msg['_spider']
    for key, value in stringify_dict(msg, keys_only=False).items():
        args.extend(['-a', '%s=%s' % (key, value)])
    return args

Пример #24

0

Показать файл

Файл: form.py Проект: kenzouyeh/scrapy

    def __init__(self, *args, **kwargs):
        """Create the request; when ``formdata`` is given, turn it into a
        url-encoded POST.

        ``formdata`` may be a dict or an iterable of ``(key, value)`` pairs.
        It is removed from ``kwargs`` before the parent constructor runs.
        """
        formdata = kwargs.pop('formdata', None)
        super(FormRequest, self).__init__(*args, **kwargs)
        if not formdata:
            return

        if isinstance(formdata, dict):
            items = formdata.iteritems()
        else:
            items = formdata
        query = []
        for key, value in items:
            query.append((unicode_to_str(key, self.encoding),
                          _unicode_to_str(value, self.encoding)))
        self.method = 'POST'
        self.body = urllib.urlencode(query, doseq=1)
        self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

Пример #25

0

Показать файл

Файл: lxmlsel.py Проект: purplecow/scrapy

    def __init__(self, response=None, text=None, namespaces=None, _root=None, _expr=None):
        """Set up the selector from a response or from raw text.

        When ``text`` is given (not None) it replaces ``response`` with a
        utf-8 ``TextResponse`` at 'about:blank'. Whenever a response is
        available, the root is built from it with ``LxmlDocument``;
        otherwise the ``_root`` argument is kept.
        """
        if text is not None:
            body = unicode_to_str(text, 'utf-8')
            response = TextResponse(url='about:blank', body=body, encoding='utf-8')
        if response is not None:
            _root = LxmlDocument(response, self._parser)

        self.namespaces = namespaces
        self.response = response
        self._root = _root
        self._expr = _expr

Пример #26

0

Показать файл

Файл: form.py Проект: reenvs/self-summary

    def __init__(self, *args, **kwargs):
        """Create the request; when ``formdata`` is given, turn it into a
        url-encoded POST.

        ``formdata`` may be a dict or an iterable of ``(key, value)`` pairs.
        It is removed from ``kwargs`` before the parent constructor runs.
        """
        formdata = kwargs.pop('formdata', None)
        super(FormRequest, self).__init__(*args, **kwargs)
        if not formdata:
            return

        if isinstance(formdata, dict):
            items = formdata.iteritems()
        else:
            items = formdata
        query = []
        for key, value in items:
            query.append((unicode_to_str(key, self.encoding),
                          _unicode_to_str(value, self.encoding)))
        self.method = 'POST'
        self._set_body(urllib.urlencode(query, doseq=1))
        self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

Пример #27

0

Показать файл

def get_crawl_args_dict(message):
    """Return arguments dictionary to use for output.

    The result merges the stringified spider arguments (every entry of
    ``message`` except ``_project``, ``_spider`` and ``settings``) with the
    stringified ``settings`` mapping; on a key collision the setting wins
    because it is applied last. ``message`` itself is not modified.

    Raises KeyError when ``_project`` or ``_spider`` is missing.
    """
    msg = message.copy()
    del msg['_project'], msg['_spider']
    settings = msg.pop('settings', {})

    args_dict = {}
    args_dict.update(stringify_dict(msg, keys_only=False))
    args_dict.update(stringify_dict(settings, keys_only=False))
    return args_dict

Пример #28

0

Показать файл

def add_sample(source):
    """
    Method for adding samples to test samples file
    (use from console)

    Picks the first index N for which "<SAMPLES_FILE_PREFIX>_N.json" does
    not exist yet, then writes the raw source to "<prefix>_N.html" and the
    parsed result, serialized as JSON, to "<prefix>_N.json".
    """
    count = 0
    while os.path.exists("%s_%d.json" % (SAMPLES_FILE_PREFIX, count)):
        count += 1

    # Context managers make sure both files are flushed and closed, also
    # when parsing or serialization fails.
    with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "wb") as html_file:
        html_file.write(unicode_to_str(source))
    parsed = list(parse_html(source))
    with open("%s_%d.json" % (SAMPLES_FILE_PREFIX, count), "wb") as json_file:
        json_file.write(json.dumps(parsed, default=_encode_element, indent=8))

Пример #29

0

Показать файл

Файл: libxml2sel.py Проект: connorsml/scrapy

 def __init__(self, response=None, text=None, node=None, parent=None, expr=None):
     """Set up the selector from a parent selector, a response or raw text.

     With a ``parent`` (not None) the parent's document is shared and
     ``node`` becomes the current node. Otherwise a truthy ``response``,
     or failing that a truthy ``text`` wrapped in a utf-8 ``TextResponse``,
     is parsed into a new ``Libxml2Document``. When none of them is given,
     only ``self.expr`` is set.
     """
     source = None
     if parent is not None:
         self.doc = parent.doc
         self.xmlNode = node
     elif response:
         source = response
     elif text:
         source = TextResponse(url='about:blank',
             body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
     if source is not None:
         self.doc = Libxml2Document(source, factory=self._get_libxml2_doc)
         self.xmlNode = self.doc.xmlDoc
     self.expr = expr

Пример #30

0

Показать файл

Файл: log.py Проект: serkanh/scrapy

def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
    """Log message according to the level.

    Nothing is logged when ``level`` is greater than the module's
    ``log_level``. The ``domain`` argument is deprecated in favour of
    ``spider`` and triggers a DeprecationWarning. The log system name is
    the domain, else the spider's ``domain_name``, else ``component``.
    """
    if level > log_level:
        return
    if domain is not None:
        import warnings
        warnings.warn("'domain' argument of scrapy.log.msg() is deprecated, "
            "use 'spider' argument instead", DeprecationWarning, stacklevel=2)
    dispatcher.send(signal=logmessage_received, message=message, level=level,
        spider=spider)
    if domain:
        system = domain
    elif spider:
        system = spider.domain_name
    else:
        system = component
    prefixed = "%s: %s" % (level_names[level], message)
    log.msg(unicode_to_str(prefixed), system=system)

Пример #31

0

Показать файл

Файл: log.py Проект: serkanh/scrapy

def err(_stuff=None, _why=None, **kwargs):
    """Log an error through ``log.err``.

    Nothing is logged when ERROR is greater than the module's
    ``log_level``. The ``domain``, ``spider`` and ``component`` keyword
    arguments are consumed here to compute the ``system`` keyword; the
    deprecated ``domain`` triggers a DeprecationWarning. A truthy ``_why``
    is prefixed with "ERROR: ".
    """
    if ERROR > log_level:
        return
    domain = kwargs.pop('domain', None)
    spider = kwargs.pop('spider', None)
    component = kwargs.pop('component', BOT_NAME)
    if domain is not None:
        import warnings
        warnings.warn("'domain' argument of scrapy.log.err() is deprecated, "
            "use 'spider' argument instead", DeprecationWarning, stacklevel=2)
    if domain:
        system = domain
    elif spider:
        system = spider.domain_name
    else:
        system = component
    kwargs['system'] = system
    if _why:
        _why = unicode_to_str("ERROR: %s" % _why)
    log.err(_stuff, _why, **kwargs)

Пример #32

0

Показать файл

Файл: truenet.py Проект: nward/trueentropy

 def parse_article(self, response):
     """Feed the response body to the kernel entropy pool in 1000-byte blocks.

     The body is converted to a latin-1 str (unencodable characters are
     dropped) and each block is submitted through the RNDADDENTROPY ioctl
     on /dev/random. Nothing is returned.
     """
     content = unicode_to_str(response.body_as_unicode(), 'latin-1',
                              'ignore')
     if not content:
         # Nothing to submit; do not touch the device at all.
         return
     # Open the device once and always release it, even if an ioctl fails.
     random_dev_fd = os.open('/dev/random', os.O_WRONLY)
     try:
         for start in range(0, len(content), 1000):
             block = content[start:start + 1000]
             # Build the data structure that RNDADDENTROPY requires: two
             # ints followed by the raw bytes. The first int is
             # 8 * len(block) (presumably the entropy credit in bits -
             # confirm against random(4)), the second is the buffer size
             # in bytes.
             layout = 'ii%is' % len(block)
             entropy_data = struct.pack(layout, 8 * len(block), len(block),
                                        block)
             # Call the RNDADDENTROPY ioctl
             ioctl(random_dev_fd, RNDADDENTROPY, entropy_data)
     finally:
         os.close(random_dev_fd)

Пример #33

0

Показать файл

Файл: lxmlsel.py Проект: richard-ma/CodeReading

 def __init__(self,
              response=None,
              text=None,
              root=None,
              expr=None,
              namespaces=None):
     """Set up the selector from a response or from raw text.

     A non-empty ``text`` takes precedence: it is wrapped in a utf-8
     ``TextResponse`` with url 'about:blank'. Otherwise ``response`` is
     stored as given.
     """
     if text:
         body = unicode_to_str(text, 'utf-8')
         response = TextResponse(url='about:blank', body=body, encoding='utf-8')
     self.response = response
     self._root = root
     self._xpathev = None
     self.namespaces = namespaces
     self.expr = expr

Пример #34

0

Показать файл

Файл: url.py Проект: kenzouyeh/scrapy

def safe_url_string(url, encoding='utf8'):
    """Return ``url`` as a str with unsafe characters percent-escaped.

    The url is first passed through ``unicode_to_str`` with ``encoding``
    (default 'utf8'); when the url was extracted from a page, pass that
    page's encoding. The result is then quoted with ``urllib.quote``,
    leaving the characters in ``_safe_chars`` untouched.

    Always returns a str.
    """
    return urllib.quote(unicode_to_str(url, encoding), _safe_chars)

Пример #35

0

Показать файл

Файл: libxml2sel.py Проект: connorsml/scrapy

 def select(self, xpath):
     """Evaluate ``xpath`` on the current node and return the matches as an
     ``XPathSelectorList``.

     The list is empty when the current node has no ``xpathEval``. A result
     without ``__iter__`` is wrapped as a single selector. Raises ValueError
     when libxml2 rejects the expression.
     """
     if not hasattr(self.xmlNode, 'xpathEval'):
         return XPathSelectorList([])

     self.doc.xpathContext.setContextNode(self.xmlNode)
     xpath = unicode_to_str(xpath, 'utf-8')
     try:
         xpath_result = self.doc.xpathContext.xpathEval(xpath)
     except libxml2.xpathError:
         raise ValueError("Invalid XPath: %s" % xpath)

     if hasattr(xpath_result, '__iter__'):
         nodes = xpath_result
     else:
         nodes = [xpath_result]
     return XPathSelectorList([self.__class__(node=found, parent=self,
         expr=xpath) for found in nodes])

Пример #36

0

Показать файл

def safe_url_string(url, encoding='utf8'):
    """Return ``url`` as a str with unsafe characters percent-escaped.

    The url is first passed through ``unicode_to_str`` with ``encoding``
    (default 'utf8'); when the url was extracted from a page, pass that
    page's encoding. The result is then quoted with ``urllib.quote``,
    leaving the characters in ``_safe_chars`` untouched.

    Always returns a str.
    """
    return urllib.quote(unicode_to_str(url, encoding), _safe_chars)

Пример #37

0

Показать файл

def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url.

    Parameters are passed either positionally or by keyword, never both.
    Returns the ``result`` member of the response. Raises JsonRpcError when
    the response carries an ``error`` member, and ValueError when both
    argument styles are mixed or the response has neither member.
    """
    if args and kwargs:
        raise ValueError("Pass *args or **kwargs but not both to jsonrpc_client_call")

    params = args or kwargs
    req = {'jsonrpc': '2.0', 'method': method, 'params': params, 'id': 1}
    data = unicode_to_str(json.dumps(req))
    raw_reply = urllib.request.urlopen(url, data).read()
    res = json.loads(raw_reply.decode('utf-8'))

    if 'result' in res:
        return res['result']
    if 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
    raise ValueError(msg)

Пример #38

0

Показать файл

 def select(self, xpath):
     """Evaluate ``xpath`` on the current node and return the matches as an
     ``XPathSelectorList``.

     The list is empty when the current node has no ``xpathEval``. A result
     without ``__iter__`` is wrapped as a single selector. Raises ValueError
     when libxml2 rejects the expression.
     """
     if not hasattr(self.xmlNode, 'xpathEval'):
         return XPathSelectorList([])

     self.doc.xpathContext.setContextNode(self.xmlNode)
     xpath = unicode_to_str(xpath, 'utf-8')
     try:
         xpath_result = self.doc.xpathContext.xpathEval(xpath)
     except libxml2.xpathError:
         raise ValueError("Invalid XPath: %s" % xpath)

     if hasattr(xpath_result, '__iter__'):
         nodes = xpath_result
     else:
         nodes = [xpath_result]
     return XPathSelectorList([self.__class__(node=found, parent=self,
         expr=xpath) for found in nodes])

Пример #39

0

Показать файл

Файл: lxmlselector.py Проект: steeve/scrapy-lxmlselector

 def __init__(self, response=None, text=None, node=None, parent=None, expr=None,
              use_html5lib=False, use_BeautifulSoup=False, namespaces=None):
     """Set up the selector from a parent selector, a response or raw text.

     With a truthy ``parent`` the parent's document is shared and ``node``
     becomes the current node. Otherwise the body of a truthy ``response``,
     or failing that of a truthy ``text`` wrapped in a utf-8
     ``TextResponse``, is parsed with ``_lxml_parse_document``.
     ``namespaces`` defaults to an empty dict.
     """
     source = None
     if parent:
         self.doc = parent.doc
         self.xmlNode = node
     elif response:
         source = response
     elif text:
         source = TextResponse(url='about:blank', body=unicode_to_str(text),
                               encoding='utf-8')
     if source is not None:
         self.xmlNode = self._lxml_parse_document(source.body, use_html5lib,
                                                  use_BeautifulSoup)
         self.doc = self.xmlNode.getroottree()
     self.expr = expr
     self.namespaces = namespaces or {}

Пример #40

0

Показать файл

 def __init__(self,
              response=None,
              text=None,
              node=None,
              parent=None,
              expr=None):
     """Set up the selector from a parent selector, a response or raw text.

     With a ``parent`` (not None) the parent's document is shared and
     ``node`` becomes the current node. Otherwise a truthy ``response``,
     or failing that a truthy ``text`` wrapped in a utf-8 ``TextResponse``,
     is parsed into a new ``Libxml2Document``. When none of them is given,
     only ``self.expr`` is set.
     """
     source = None
     if parent is not None:
         self.doc = parent.doc
         self.xmlNode = node
     elif response:
         source = response
     elif text:
         source = TextResponse(url='about:blank',
             body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
     if source is not None:
         self.doc = Libxml2Document(source, factory=self._get_libxml2_doc)
         self.xmlNode = self.doc.xmlDoc
     self.expr = expr

Пример #41

0

Показать файл

Файл: spider_test.py Проект: 2014fgq/sedemo-spider

    def assertObjectMatch(self, expected, actual, msg=None, keys=None):  # {{{
        """Fail unless ``actual`` matches every checked entry of ``expected``.

        ``expected`` maps keys to expected values. A key of the form
        ``r:name`` is checked with a regex search of the expected pattern
        against ``actual``'s ``name``; any other key is checked for
        equality. ``actual`` is read with ``.get`` when it has
        ``__getitem__`` and with ``getattr`` otherwise; missing entries read
        as None. ``keys`` restricts the check to the given keys (default:
        all keys of ``expected``). All mismatches are reported together,
        preceded by ``msg`` when given.
        """
        actual_is_dict = hasattr(actual, "__getitem__")

        def my_type(o):
            # Slices the repr of the type, e.g. "<type 'int'>" -> "int".
            return str(type(o))[7:-2]

        def parse_key(key):
            # "how:name" -> (how, name); any other shape -> (None, key).
            how = None
            parts = key.split(":")
            if len(parts) == 2:
                how, key = parts[0], parts[1]
            return (how, key)

        def get_value(obj, key):
            if actual_is_dict:
                return obj.get(key, None)
            return getattr(obj, key, None)

        def check_match(expected, actual, how):
            if how == "r" and expected and actual:  # regex match
                return re.search(expected, actual) is not None
            # Plain comparison; also used for regex keys when either side
            # is empty, since there is nothing to search.
            return expected == actual

        keys = keys or expected.keys()
        err_lines = []
        for key in keys:
            how, actual_key = parse_key(key)
            ev = expected[key]
            av = get_value(actual, actual_key)
            if not check_match(ev, av, how):
                errmsg = "%s: %s %s != %s %s" % (key, my_type(ev), ev, my_type(av), av)
                err_lines.append(errmsg)

        errmsg = "\n".join(err_lines)
        if msg:
            errmsg = msg + "\n" + errmsg
        # assertFalse is the supported spelling of the deprecated failIf.
        self.assertFalse(err_lines, unicode_to_str(errmsg))

Пример #42

0

Показать файл

    def _get_log_item(self, ev):
        """Get HubStorage log item for the given Twisted event, or None if no
        document should be inserted.

        Events from the 'scrapy' system use their own ``logLevel``; other
        events are logged as ERROR when they are errors and as INFO
        otherwise. The message text is taken, in increasing priority, from
        the first ``message`` part, the failure traceback, and the
        ``format`` string interpolated with the event.
        """
        if ev['system'] == 'scrapy':
            level = ev['logLevel']
        elif ev['isError']:
            level = logging.ERROR
        else:
            level = logging.INFO

        # It's important to access level through the handler instance,
        # min log level can change at any moment.
        if level < self._hs_loghdlr.level:
            return

        parts = ev.get('message')
        # Always start from a real string: an event with an empty/missing
        # message and no failure/format used to reach msg.replace() with a
        # tuple or None.
        msg = unicode_to_str(parts[0]) if parts else ''

        failure = ev.get('failure', None)
        if failure:
            msg = failure.getTraceback()

        why = ev.get('why', None)
        if why:
            msg = "%s\n%s" % (why, msg)

        fmt = ev.get('format')
        if fmt:
            try:
                msg = fmt % ev
            except Exception:
                # A broken format string must not break logging; report it
                # instead. KeyboardInterrupt/SystemExit are not swallowed.
                msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
                level = logging.ERROR

        msg = msg.replace(
            '\n', '\n\t')  # to replicate typical scrapy log appearance
        return {'message': msg, 'level': level}

Пример #43

0

Показать файл

Файл: log.py Проект: duendex/scrapinghub-buildpack-scrapy

    def _get_log_item(self, ev):
        """Get HubStorage log item for the given Twisted event, or None if no
        document should be inserted.

        Events from the 'scrapy' system use their own ``logLevel``; other
        events are logged as ERROR when they are errors and as INFO
        otherwise. The message text is taken, in increasing priority, from
        the first ``message`` part, the failure traceback, and the
        ``format`` string interpolated with the event.
        """
        if ev['system'] == 'scrapy':
            level = ev['logLevel']
        elif ev['isError']:
            level = logging.ERROR
        else:
            level = logging.INFO

        # It's important to access level through the handler instance,
        # min log level can change at any moment.
        if level < self._hs_loghdlr.level:
            return

        parts = ev.get('message')
        # Always start from a real string: an event with an empty/missing
        # message and no failure/format used to reach msg.replace() with a
        # tuple or None.
        msg = unicode_to_str(parts[0]) if parts else ''

        failure = ev.get('failure', None)
        if failure:
            msg = failure.getTraceback()

        why = ev.get('why', None)
        if why:
            msg = "%s\n%s" % (why, msg)

        fmt = ev.get('format')
        if fmt:
            try:
                msg = fmt % ev
            except Exception:
                # A broken format string must not break logging; report it
                # instead. KeyboardInterrupt/SystemExit are not swallowed.
                msg = "UNABLE TO FORMAT LOG MESSAGE: fmt=%r ev=%r" % (fmt, ev)
                level = logging.ERROR

        msg = msg.replace('\n', '\n\t')  # to replicate typical scrapy log appearance
        return {'message': msg, 'level': level}

Пример #44

0

Показать файл

 def _add_link(url_sel, alt_sel=None):
     """Append a Link built from the selectors to the enclosing ``ret`` list.

     The first extracted url (converted with the enclosing ``encoding``)
     and the first extracted alt text are used; the alt text defaults to an
     empty unicode string. Nothing is appended when no url is extracted.
     """
     urls = flatten([url_sel.extract()])
     if alt_sel:
         alts = flatten([alt_sel.extract()])
     else:
         alts = (u'', )
     if urls:
         ret.append(Link(unicode_to_str(urls[0], encoding), alts[0]))

Пример #45

0

Показать файл

Файл: utils.py Проект: marianela4711/Veracitor

def to_scrapy_response(url, body):
    """Build a TextResponse for *url* from *body*.

    The body is passed through ``unicode_to_str`` with ``'utf-8'`` and the
    response is created with ``encoding='utf-8'``.
    """
    encoded_body = unicode_to_str(body, 'utf-8')
    return TextResponse(url=url, body=encoded_body, encoding='utf-8')

Пример #46

0

Показать файл

Файл: form.py Проект: reenvs/self-summary

def _unicode_to_str(string, encoding):
    if hasattr(string, '__iter__'):
        return [unicode_to_str(k, encoding) for k in string]
    else:
        return unicode_to_str(string, encoding)

Пример #47

0

Показать файл

Файл: items.py Проект: raineydavid/regulated-rents

def u_to_str(text):
    """Return *text* converted by ``unicode_to_str``.

    ``'latin-1'`` and ``'ignore'`` are forwarded positionally (presumably
    the target encoding and the error-handling mode -- confirm against
    ``unicode_to_str``'s signature).

    The converted value is returned to the caller; previously it was
    computed and discarded, so the function always returned None.
    """
    return unicode_to_str(text, 'latin-1', 'ignore')

Пример #48

0

Показать файл

Файл: bookingisraelcom.py Проект: oceancloud82/scraping

def _urlencode(seq, enc):
    """URL-encode ``(key, value-or-values)`` pairs after converting them with *enc*.

    A value exposing ``__iter__`` is expanded into one pair per item; any
    other value yields a single pair. Keys and values are each passed
    through ``unicode_to_str`` before encoding.
    """
    pairs = []
    for key, raw in seq:
        items = raw if hasattr(raw, '__iter__') else [raw]
        for item in items:
            # The key is converted once per emitted pair, as in the
            # comprehension form, so an empty value list converts nothing.
            pairs.append((unicode_to_str(key, enc), unicode_to_str(item, enc)))
    return urllib.urlencode(pairs, doseq=1)

Пример #49

0

Показать файл

Файл: pipelines_my.py Проект: easyshell/general_crawler

def unicode_to_gbk(src):
    """Return *src* converted by ``unicode_to_str`` using ``'gbk'`` with ``errors='ignore'``."""
    converted = unicode_to_str(src, 'gbk', errors='ignore')
    return converted

Пример #50

0

Показать файл

Файл: unified.py Проект: youngdev/scrapy

def _response_from_text(text, st):
    """Wrap *text* in a response object located at ``about:blank``.

    An ``XmlResponse`` is created when *st* equals ``'xml'``; any other
    value produces an ``HtmlResponse``. The body is *text* passed through
    ``unicode_to_str`` with ``'utf-8'``.
    """
    if st == 'xml':
        response_cls = XmlResponse
    else:
        response_cls = HtmlResponse
    body = unicode_to_str(text, 'utf-8')
    return response_cls(url='about:blank', encoding='utf-8', body=body)

Пример #51

0

Показать файл

Файл: url.py Проект: zlszhonglongshen/scrapy

def parse_url(url, encoding=None):
    """Return the urlparsed form of *url*.

    A value that is already a ``ParseResult`` is returned unchanged;
    anything else is passed through ``unicode_to_str`` with *encoding*
    and then through ``urlparse``.
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(unicode_to_str(url, encoding))

Пример #52

0

Показать файл

Файл: unified.py Проект: peterlee2008/scrapy

def _response_from_text(text, st):
    """Create an ``about:blank`` response whose body is *text*.

    *st* selects the response class: ``"xml"`` gives an ``XmlResponse``,
    everything else an ``HtmlResponse``. The body is produced by
    ``unicode_to_str(text, "utf-8")``.
    """
    if st != "xml":
        response_class = HtmlResponse
    else:
        response_class = XmlResponse
    options = {
        "url": "about:blank",
        "encoding": "utf-8",
        "body": unicode_to_str(text, "utf-8"),
    }
    return response_class(**options)

Пример #53

0

Показать файл

Файл: form.py Проект: happyboy310/book-crawler

def _unicode_to_str(string, encoding):
    if hasattr(string, "__iter__"):
        return [unicode_to_str(k, encoding) for k in string]
    else:
        return unicode_to_str(string, encoding)

Пример #54

0

Показать файл

 def _unicode_to_str(self, eventDict):
     message = eventDict.get('message')
     if message:
         eventDict['message'] = tuple(
             unicode_to_str(x, self.encoding) for x in message)
     return eventDict

Python unicode_to_str примеры использования