Exemplo n.º 1
0
Arquivo: base.py Projeto: lorien/grab
    def setup_document(self, content, **kwargs):
        """
        Setup `response` object without real network requests.

        Useful for testing and debuging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()
        if isinstance(content, six.text_type):
            raise error.GrabMisuseError('Method `setup_document` accepts only '
                                        'byte string in `content` argument.')

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = b'HTTP/1.1 200 OK\r\n\r\n'
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc
Exemplo n.º 2
0
Arquivo: curl.py Projeto: lyicy/grab
    def prepare_response(self, grab):
        if self.body_file:
            self.body_file.close()
        response = Document()

        response.head = b''.join(self.response_header_chunks)

        if self.body_path:
            response.body_path = self.body_path
        else:
            response.body = b''.join(self.response_body_chunks)

        # Clear memory
        self.response_header_chunks = []
        self.response_body_chunks = []

        response.code = self.curl.getinfo(pycurl.HTTP_CODE)
        response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = self.curl.getinfo(pycurl.EFFECTIVE_URL)

        response.parse(charset=grab.config['document_charset'])

        response.cookies = CookieManager(self.extract_cookiejar())

        # We do not need anymore cookies stored in the
        # curl instance so drop them
        self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
Exemplo n.º 3
0
 def custom_prepare_response_func(transport, grab):
     doc = Document()
     doc.head = cache_item['head']
     doc.body = body
     doc.code = cache_item['response_code']
     doc.download_size = len(body)
     doc.upload_size = 0
     doc.download_speed = 0
     doc.url = cache_item['response_url']
     doc.parse(charset=grab.config['document_charset'])
     doc.cookies = CookieManager(transport.extract_cookiejar())
     doc.from_cache = True
     return doc
Exemplo n.º 4
0
    def prepare_response(self, grab):
        # Information about urllib3
        # On python2 urllib3 headers contains original binary data
        # On python3 urllib3 headers are converted to unicode
        # using latin encoding
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Document()

            head = ''
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='utf-8')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                bytes_read = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        bytes_read += len(chunk)
                        chunks.append(chunk)
                        if maxsize and bytes_read > maxsize:
                            # reached limit on bytes to read
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started >
                                self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request.response_path:
                response.body_path = self._request.response_path
                # FIXME: Quick dirty hack, actullay, response is fully
                # read into memory
                self._request.response_file.write(read_with_timeout())
                self._request.response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = (self.curl
            #                             .getinfo(pycurl.NAMELOOKUP_TIME))
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                #if key == 'Location':
                #    import pdb; pdb.set_trace()
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()  # self._response, self._request)
            response.cookies = CookieManager(jar)

            # We do not need anymore cookies stored in the
            # curl instance so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
Exemplo n.º 5
0
 def _get_doc(self):
     if self._doc is None:
         self._doc = Document(self)
     return self._doc
Exemplo n.º 6
0
    def setup_document(self, content, **kwargs):
        """
        Setup `response` object without real network requests.

        Useful for testing and debuging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = ''
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc
Exemplo n.º 7
0
    def setup_document(self, content, **kwargs):
        """
        Setup `response` object without real network requests.

        Useful for testing and debuging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()
        if isinstance(content, six.text_type):
            raise error.GrabMisuseError('Method `setup_document` accepts only '
                                        'byte string in `content` argument.')

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = b'HTTP/1.1 200 OK\r\n\r\n'
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc
Exemplo n.º 8
0
    def prepare_response(self, grab):
        # Information about urllib3
        # On python2 urllib3 headers contains original binary data
        # On python3 urllib3 headers are converted to unicode
        # using latin encoding
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Document()

            head = ''
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='utf-8')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                bytes_read = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        bytes_read += len(chunk)
                        chunks.append(chunk)
                        if maxsize and bytes_read > maxsize:
                            # reached limit on bytes to read
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started
                                > self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request.response_path:
                response.body_path = self._request.response_path
                # FIXME: Quick dirty hack, actullay, response is fully
                # read into memory
                self._request.response_file.write(read_with_timeout())
                self._request.response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = (self.curl
            #                             .getinfo(pycurl.NAMELOOKUP_TIME))
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                #if key == 'Location':
                #    import pdb; pdb.set_trace()
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar() # self._response, self._request)
            response.cookies = CookieManager(jar)

            # We do not need anymore cookies stored in the
            # curl instance so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
Exemplo n.º 9
0
 def custom_prepare_response_func(transport, grab):
     doc = Document()
     doc.head = cache_item['head']
     doc.body = body
     doc.code = cache_item['response_code']
     doc.download_size = len(body)
     doc.upload_size = 0
     doc.download_speed = 0
     doc.url = cache_item['response_url']
     doc.parse(charset=grab.config['document_charset'])
     doc.cookies = CookieManager(transport.extract_cookiejar())
     doc.from_cache = True
     return doc
Exemplo n.º 10
0
Arquivo: curl.py Projeto: lorien/grab
    def prepare_response(self, grab):
        if self.body_file:
            self.body_file.close()
        response = Document()

        response.head = b''.join(self.response_header_chunks)

        if self.body_path:
            response.body_path = self.body_path
        else:
            response.body = b''.join(self.response_body_chunks)

        # Clear memory
        self.response_header_chunks = []
        self.response_body_chunks = []

        response.code = self.curl.getinfo(pycurl.HTTP_CODE)
        response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = self.curl.getinfo(pycurl.EFFECTIVE_URL)

        response.parse(charset=grab.config['document_charset'])

        response.cookies = CookieManager(self.extract_cookiejar())

        # We do not need anymore cookies stored in the
        # curl instance so drop them
        self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
Exemplo n.º 11
0
    def setup_document(self, content, **kwargs):
        """
        Setup `response` object without real network requests.

        Useful for testing and debuging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = ''
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc