Example #1
        def setopt(name, value):
            option_name = 'CURLOPT_%s' % name.upper()
            if name.islower() and hasattr(const, option_name):
                option_value = getattr(const, option_name)

                if name in self._CURLOPT_SLIST:
                    value = lib.list2pointer_slist(value)
                    # free any slist previously allocated for this option,
                    # then keep the new one so it can be freed later
                    if name in self._slist:
                        lib.curl_slist_free_all(self._slist[name])
                    self._slist[name] = value
                elif hasattr(prototype, name):
                    if callable(value):
                        value = getattr(prototype, name)(value)
                elif name == 'postfields' and isinstance(value, dict):
                    value = urllib.urlencode(value)
                elif name == 'share':
                    value = value._handle
                elif name == 'url' and value:
                    value = iri2uri(value)
                    if isinstance(value, unicode):
                        value = value.encode('utf-8')

                # apply the option to the underlying easy handle and remember
                # what was set for later inspection
                lib.curl_easy_setopt(self._handle, option_value, value)
                self._buff[option_name] = value
            else:
                raise ValueError('invalid option name "%s"' % name)
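A minimal usage sketch for the option-mapping pattern above, assuming a hypothetical Curl wrapper class that exposes this setopt() together with the const/lib/prototype bindings it references; the wrapper name and the specific options shown are illustrative, not a confirmed API:

    curl = Curl()                                      # hypothetical wrapper exposing setopt()
    curl.setopt('url', u'http://example.com/caf\xe9')  # IRI is normalized to a URI and UTF-8 encoded
    curl.setopt('httpheader', ['Accept: text/html'])   # assumed slist option, converted via list2pointer_slist
    curl.setopt('postfields', {'q': 'test'})           # dict bodies are urlencoded before being handed to libcurl
    try:
        curl.setopt('FOLLOWLOCATION', 1)               # uppercase names fail the islower() check
    except ValueError, e:
        print e                                        # invalid option name "FOLLOWLOCATION"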
Example #2
    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS):
        """ Performs a single HTTP request.
The 'uri' is the URI of the HTTP resource and can begin 
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.

The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc. 
There is no restriction on the methods allowed.

The 'body' is the entity body to be sent with the request. It is a string
object.

Any extra headers that are to be sent with the request should be provided in the
'headers' dictionary.

The maximum number of redirects to follow before raising an
exception is 'redirections'. The default is 5.

The return value is a tuple of (response, content), the first
being an instance of the 'Response' class, the second being
a string that contains the response entity body.
        """
        if headers is None:
            headers = {}
        else:
            headers = _normalize_headers(headers)

        if not headers.has_key('user-agent'):
            headers['user-agent'] = "Python-httplib2/%s" % __version__

        uri = iri2uri(uri)

        (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

        if not self.connections.has_key(scheme+":"+authority):
            connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
            conn = self.connections[scheme+":"+authority] = connection_type(authority)
            conn.set_debuglevel(debuglevel)
        else:
            conn = self.connections[scheme+":"+authority]

        if method in ["GET", "HEAD"] and 'range' not in headers:
            headers['accept-encoding'] = 'compress, gzip'

        info = email.Message.Message()
        cached_value = None
        if self.cache:
            cachekey = defrag_uri
            cached_value = self.cache.get(cachekey)
            if cached_value:
                try:
                    info = email.message_from_string(cached_value)
                    content = cached_value.split('\r\n\r\n', 1)[1]
                except Exception, e:
                    self.cache.delete(cachekey)
                    cachekey = None
                    cached_value = None
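The cache lookup above assumes each cached entry is stored as the serialized response headers, a blank CRLF line, then the entity body; a rough sketch of that layout (header values are made up for illustration):

    cached_value = ('status: 200\r\n'
                    'content-type: text/html\r\n'
                    'etag: "abc123"\r\n'
                    '\r\n'
                    '<html>...</html>')
    info = email.message_from_string(cached_value)     # parsed header block
    content = cached_value.split('\r\n\r\n', 1)[1]     # entity body after the blank line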
Example #3
    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
        """ Performs a single HTTP request.
The 'uri' is the URI of the HTTP resource and can begin
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.

The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
There is no restriction on the methods allowed.

The 'body' is the entity body to be sent with the request. It is a string
object.

Any extra headers that are to be sent with the request should be provided in the
'headers' dictionary.

The maximum number of redirects to follow before raising an
exception is 'redirections'. The default is 5.

The return value is a tuple of (response, content), the first
being an instance of the 'Response' class, the second being
a string that contains the response entity body.
        """
        try:
            if headers is None:
                headers = {}
            else:
                headers = _normalize_headers(headers)

            if not headers.has_key('user-agent'):
                headers['user-agent'] = "Python-httplib2/%s" % __version__

            uri = iri2uri(uri)

            (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

            conn_key = scheme+":"+authority
            if conn_key in self.connections:
                conn = self.connections[conn_key]
            else:
                if not connection_type:
                    connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
                certs = list(self.certificates.iter(authority))
                if scheme == 'https' and certs:
                    conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
                        cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
                else:
                    conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
                conn.set_debuglevel(debuglevel)

            if method in ["GET", "HEAD"] and 'range' not in headers:
                headers['accept-encoding'] = 'compress, gzip'

            info = email.Message.Message()
            cached_value = None
            if self.cache:
                cachekey = defrag_uri
                cached_value = self.cache.get(cachekey)
                if cached_value:
                    info = email.message_from_string(cached_value)
                    try:
                        content = cached_value.split('\r\n\r\n', 1)[1]
                    except IndexError:
                        self.cache.delete(cachekey)
                        cachekey = None
                        cached_value = None
            else:
                cachekey = None

            if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
                # http://www.w3.org/1999/04/Editing/
                headers['if-match'] = info['etag']

            if method not in ["GET", "HEAD"] and self.cache and cachekey:
                # RFC 2616 Section 13.10
                self.cache.delete(cachekey)

            if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
                if info.has_key('-x-permanent-redirect-url'):
                    # Should cached permanent redirects be counted in our redirection count? For now, yes.
                    (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
                    response.previous = Response(info)
                    response.previous.fromcache = True
                else:
                    # Determine our course of action:
                    #   Is the cached entry fresh or stale?
                    #   Has the client requested a non-cached response?
                    #
                    # There seems to be three possible answers:
                    # 1. [FRESH] Return the cache entry w/o doing a GET
                    # 2. [STALE] Do the GET (but add in cache validators if available)
                    # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
                    entry_disposition = _entry_disposition(info, headers)

                    if entry_disposition == "FRESH":
                        if not cached_value:
                            info['status'] = '504'
                            content = ""
                        response = Response(info)
                        if cached_value:
                            response.fromcache = True
                        return (response, content)

                    if entry_disposition == "STALE":
                        if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                            headers['if-none-match'] = info['etag']
                        if info.has_key('last-modified') and not 'last-modified' in headers:
                            headers['if-modified-since'] = info['last-modified']
                    elif entry_disposition == "TRANSPARENT":
                        pass

                    (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)

                if response.status == 304 and method == "GET":
                    # Rewrite the cache entry with the new end-to-end headers
                    # Take all headers that are in response
                    # and overwrite their values in info.
                    # unless they are hop-by-hop, or are listed in the connection header.

                    for key in _get_end2end_headers(response):
                        info[key] = response[key]
                    merged_response = Response(info)
                    if hasattr(response, "_stale_digest"):
                        merged_response._stale_digest = response._stale_digest
                    _updateCache(headers, merged_response, content, self.cache, cachekey)
                    response = merged_response
                    response.status = 200
                    response.fromcache = True

                elif response.status == 200:
                    content = new_content
                else:
                    self.cache.delete(cachekey)
                    content = new_content
            else:
                (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
        except Exception, e:
            if self.force_exception_to_status_code:
                if isinstance(e, HttpLib2ErrorWithResponse):
                    response = e.response
                    content = e.content
                    response.status = 500
                    response.reason = str(e)
                elif isinstance(e, socket.timeout):
                    content = "Request Timeout"
                    response = Response( {
                            "content-type": "text/plain",
                            "status": "408",
                            "content-length": len(content)
                            })
                    response.reason = "Request Timeout"
                else:
                    content = str(e)
                    response = Response( {
                            "content-type": "text/plain",
                            "status": "400",
                            "content-length": len(content)
                            })
                    response.reason = "Bad Request"
            else:
                raise
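A short usage sketch for this request() method, assuming the surrounding class is httplib2's Http with an on-disk cache; the cache directory and URL are illustrative:

    import httplib2

    h = httplib2.Http('.cache')                  # responses are cached on disk, keyed by the defragmented URI
    response, content = h.request('http://example.org/', 'GET')
    print response.status, response.fromcache    # e.g. 200 False on the first fetch

    response, content = h.request('http://example.org/')
    print response.fromcache                     # True if the cached entry is still fresh per its cache headers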
Example #4
    def request(self,
                uri,
                method="GET",
                body=None,
                headers=None,
                redirections=DEFAULT_MAX_REDIRECTS,
                connection_type=None):
        """ Performs a single HTTP request.
The 'uri' is the URI of the HTTP resource and can begin
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.

The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
There is no restriction on the methods allowed.

The 'body' is the entity body to be sent with the request. It is a string
object.

Any extra headers that are to be sent with the request should be provided in the
'headers' dictionary.

The maximum number of redirects to follow before raising an
exception is 'redirections'. The default is 5.

The return value is a tuple of (response, content), the first
being an instance of the 'Response' class, the second being
a string that contains the response entity body.
        """
        try:
            if headers is None:
                headers = {}
            else:
                headers = _normalize_headers(headers)

            if not headers.has_key('user-agent'):
                headers['user-agent'] = "Python-httplib2/%s" % __version__

            uri = iri2uri(uri)

            (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

            conn_key = scheme + ":" + authority
            if conn_key in self.connections:
                conn = self.connections[conn_key]
            else:
                if not connection_type:
                    connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
                certs = list(self.certificates.iter(authority))
                if scheme == 'https' and certs:
                    conn = self.connections[conn_key] = connection_type(
                        authority,
                        key_file=certs[0][0],
                        cert_file=certs[0][1],
                        timeout=self.timeout,
                        proxy_info=self.proxy_info)
                else:
                    conn = self.connections[conn_key] = connection_type(
                        authority,
                        timeout=self.timeout,
                        proxy_info=self.proxy_info)
                conn.set_debuglevel(debuglevel)

            if method in ["GET", "HEAD"] and 'range' not in headers:
                headers['accept-encoding'] = 'compress, gzip'

            info = email.Message.Message()
            cached_value = None
            if self.cache:
                cachekey = defrag_uri
                cached_value = self.cache.get(cachekey)
                if cached_value:
                    info = email.message_from_string(cached_value)
                    try:
                        content = cached_value.split('\r\n\r\n', 1)[1]
                    except IndexError:
                        self.cache.delete(cachekey)
                        cachekey = None
                        cached_value = None
            else:
                cachekey = None

            if method in ["PUT"] and self.cache and info.has_key(
                    'etag'
            ) and not self.ignore_etag and 'if-match' not in headers:
                # http://www.w3.org/1999/04/Editing/
                headers['if-match'] = info['etag']

            if method not in ["GET", "HEAD"] and self.cache and cachekey:
                # RFC 2616 Section 13.10
                self.cache.delete(cachekey)

            if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
                if info.has_key('-x-permanent-redirect-url'):
                    # Should cached permanent redirects be counted in our redirection count? For now, yes.
                    (response, new_content) = self.request(
                        info['-x-permanent-redirect-url'],
                        headers=headers,
                        redirections=redirections - 1)
                    response.previous = Response(info)
                    response.previous.fromcache = True
                else:
                    # Determine our course of action:
                    #   Is the cached entry fresh or stale?
                    #   Has the client requested a non-cached response?
                    #
                    # There seems to be three possible answers:
                    # 1. [FRESH] Return the cache entry w/o doing a GET
                    # 2. [STALE] Do the GET (but add in cache validators if available)
                    # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
                    entry_disposition = _entry_disposition(info, headers)

                    if entry_disposition == "FRESH":
                        if not cached_value:
                            info['status'] = '504'
                            content = ""
                        response = Response(info)
                        if cached_value:
                            response.fromcache = True
                        return (response, content)

                    if entry_disposition == "STALE":
                        if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                            headers['if-none-match'] = info['etag']
                        if info.has_key('last-modified') and not 'last-modified' in headers:
                            headers['if-modified-since'] = info['last-modified']
                    elif entry_disposition == "TRANSPARENT":
                        pass

                    (response, new_content) = self._request(
                        conn, authority, uri, request_uri, method, body,
                        headers, redirections, cachekey)

                if response.status == 304 and method == "GET":
                    # Rewrite the cache entry with the new end-to-end headers
                    # Take all headers that are in response
                    # and overwrite their values in info.
                    # unless they are hop-by-hop, or are listed in the connection header.

                    for key in _get_end2end_headers(response):
                        info[key] = response[key]
                    merged_response = Response(info)
                    if hasattr(response, "_stale_digest"):
                        merged_response._stale_digest = response._stale_digest
                    _updateCache(headers, merged_response, content, self.cache,
                                 cachekey)
                    response = merged_response
                    response.status = 200
                    response.fromcache = True

                elif response.status == 200:
                    content = new_content
                else:
                    self.cache.delete(cachekey)
                    content = new_content
            else:
                (response, content) = self._request(conn, authority, uri,
                                                    request_uri, method, body,
                                                    headers, redirections,
                                                    cachekey)
        except Exception, e:
            if self.force_exception_to_status_code:
                if isinstance(e, HttpLib2ErrorWithResponse):
                    response = e.response
                    content = e.content
                    response.status = 500
                    response.reason = str(e)
                elif isinstance(e, socket.timeout):
                    content = "Request Timeout"
                    response = Response({
                        "content-type": "text/plain",
                        "status": "408",
                        "content-length": len(content)
                    })
                    response.reason = "Request Timeout"
                else:
                    content = str(e)
                    response = Response({
                        "content-type": "text/plain",
                        "status": "400",
                        "content-length": len(content)
                    })
                    response.reason = "Bad Request"
            else:
                raise
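The HTTPS branch above looks up client certificates per authority via self.certificates.iter(authority); a brief sketch of registering one with httplib2's add_certificate (the file paths and host name are illustrative):

    import httplib2

    h = httplib2.Http(timeout=10)
    # key/cert registered for one authority; request() picks them up when it
    # builds the HTTPSConnectionWithTimeout for that host
    h.add_certificate('client.key', 'client.crt', 'secure.example.org')
    response, content = h.request('https://secure.example.org/api')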
Example #5
	def getpage(self, pgreq, addlHeaders = None, returnMultiple = False, callBack=None, postData=None, soup=False):

		# pgreq = fixurl(pgreq)
		# print pgreq
		# print type(pgreq)

		originalString = pgreq

		log = self.log

		pgctnt = "Failed"
		pghandle = None

		loopctr = 0


		# Encode Unicode URL's properly
		pgreq = iri2uri.iri2uri(pgreq)

		try:
			# TODO: make this more sensible
			if addlHeaders != None and  postData != None:
				log.info("Making a post-request with additional headers!")
				pgreq = urllib.request.Request(pgreq, headers=addlHeaders, data=urllib.parse.urlencode(postData).encode("utf-8"))
			elif addlHeaders != None:
				pgreq = urllib.request.Request(pgreq, headers=addlHeaders)
			elif postData != None:
				log.info("Making a post request!")
				pgreq = urllib.request.Request(pgreq, data=urllib.parse.urlencode(postData).encode("utf-8"))

			else:
				pgreq = urllib.request.Request(pgreq)
		except:
			log.critical("Invalid header or url")
			raise

		errored = False
		lastErr = ""

		delay = 1.5
		if not self.testMode:
			while 1:

				loopctr = loopctr + 1



				if loopctr > self.errorOutCount:
					log.error("Failed to retrieve Website : %s at %s All Attempts Exhausted", pgreq.get_full_url(), time.ctime(time.time()))
					pgctnt = "Failed"
					try:
						print(("Critical Failure to retrieve page! %s at %s, attempt %s" % (pgreq.get_full_url(), time.ctime(time.time()), loopctr)))
						print(("Error:", lastErr))
						print("Exiting")
					except:
						print("And the URL could not be printed due to an encoding error")
					break

				#print "execution", loopctr
				try:

					# print("request type = ", type(pgreq))
					pghandle = self.opener.open(pgreq)					# Get Webpage

				except urllib.error.HTTPError as e:								# Lotta logging
					log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), loopctr)
					log.warning("Error Code: %s", e)

					#traceback.print_exc()
					lastErr = e
					try:
						log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), loopctr)
						log.warning("Error: %s, Original URL: %s", e, originalString)
						errored = True
					except:
						log.warning("And the URL could not be printed due to an encoding error")

					if e.code == 404:
						#print "Unrecoverable - Page not found. Breaking"
						log.critical("Unrecoverable - Page not found. Breaking")
						break

					time.sleep(delay)

				except UnicodeEncodeError:
					log.critical("Unrecoverable Unicode issue retreiving page - %s", originalString)
					break

				except Exception:
					errored = True
					#traceback.print_exc()
					lastErr = sys.exc_info()
					log.warning("Retreival failed. Traceback:")
					log.warning(lastErr)
					log.warning(traceback.format_exc())

					log.warning("Error Retrieving Page! - Trying again - Waiting 2.5 seconds")

					try:
						print(("Error on page - %s" % originalString))
					except:
						print("And the URL could not be printed due to an encoding error")

					time.sleep(delay)


					continue

				if pghandle != None:
					try:

						log.info("Request for URL: %s succeeded at %s On Attempt %s. Recieving...", pgreq.get_full_url(), time.ctime(time.time()), loopctr)
						if callBack:
							pgctnt = self.chunkRead(pghandle, 2 ** 17, reportHook = callBack)
						else:
							pgctnt = pghandle.read()
						if pgctnt != None:

							log.info("URL fully retrieved.")

							preDecompSize = len(pgctnt)/1000.0

							encoded = pghandle.headers.get('Content-Encoding')
							#preLen = len(pgctnt)
							if encoded == 'deflate':
								compType = "deflate"

								pgctnt = zlib.decompress(pgctnt, -zlib.MAX_WBITS)

							elif encoded == 'gzip':
								compType = "gzip"

								buf = io.BytesIO(pgctnt)
								f = gzip.GzipFile(fileobj=buf)
								pgctnt = f.read()

							elif encoded == "sdch":
								raise ValueError("Wait, someone other then google actually supports SDCH compression?")

							else:
								compType = "none"

							decompSize = len(pgctnt)/1000.0
							# self.log.info("Page content type = %s", type(pgctnt))
							cType = pghandle.headers.get("Content-Type")
							self.log.info("Compression type = %s. Content Size compressed = %0.3fK. Decompressed = %0.3fK. File type: %s.", compType, preDecompSize, decompSize, cType)

							if "text/html" in cType:				# If this is a html/text page, we want to decode it using the local encoding

								if (";" in cType) and ("=" in cType): 		# the server is reporting an encoding. Now we use it to decode the

									dummy_docType, charset = cType.split(";")
									charset = charset.split("=")[-1]


								else:		# The server is not reporting an encoding in the headers.

									# this *should* probably be done using a parser.
									# However, it seems to be grossly overkill to shove the whole page (which can be quite large) through a parser just to pull out a tag that
									# should be right near the page beginning anyways.
									# As such, it's a regular expression for the moment

									# Regex is of bytes type, since we can't convert a string to unicode until we know the encoding the
									# bytes string is using, and we need the regex to get that encoding
									coding = re.search(b"charset=[\'\"]?([a-zA-Z0-9\-]*)[\'\"]?", pgctnt, flags=re.IGNORECASE)

									cType = b""
									if coding:
										cType = coding.group(1)

									if (b";" in cType) and (b"=" in cType): 		# the server is reporting an encoding. Now we use it to decode the

										dummy_docType, charset = cType.split(b";")
										charset = charset.split(b"=")[-1]

									else:
										charset = "iso-8859-1"

								try:
									pgctnt = str(pgctnt, charset)

								except UnicodeDecodeError:
									self.log.error("Encoding Error! Stripping invalid chars.")
									pgctnt = pgctnt.decode('utf-8', errors='ignore')

								if soup:
									pgctnt = bs4.BeautifulSoup(pgctnt)
							elif "text/plain" in cType or "text/xml" in cType:
								pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup

							elif "text" in cType:
								self.log.critical("Unknown content type!")
								self.log.critical(cType)

								print("Unknown content type!")
								print(cType)


							break


					except:
						print(("pghandle = ", pghandle))

						traceback.print_exc()
						log.error(sys.exc_info())
						log.error("Error Retrieving Page! - Transfer failed. Waiting %s seconds before retrying", delay)

						try:
							print(("Critical Failure to retrieve page! %s at %s" % (pgreq.get_full_url(), time.ctime(time.time()))))
							print("Exiting")
						except:
							print("And the URL could not be printed due to an encoding error")
						print()
						log.error(pghandle)
						time.sleep(delay)

		if errored and pghandle != None:
			print(("Later attempt succeeded %s" % pgreq.get_full_url()))
			#print len(pgctnt)
		elif errored and pghandle == None:
			raise urllib.error.URLError("Failed to retreive page!")

		if returnMultiple:
			if self.testMode:
				raise ValueError("testing mode does not support multiple return values yet!")
			return pgctnt, pghandle
		else:
			if self.testMode:
				return self.testMode
			else:
				return pgctnt
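A minimal usage sketch for getpage(), assuming it is a method of a fetcher class (called WebGetRobust here purely for illustration) that supplies self.opener, self.log, self.errorOutCount and self.testMode:

    fetcher = WebGetRobust()                 # hypothetical class exposing the getpage() above

    # plain GET; text/html responses come back decoded to str
    page = fetcher.getpage("http://example.com/")

    # POST with extra headers, returning both the body and the response handle
    content, handle = fetcher.getpage(
        "http://example.com/search",
        addlHeaders={"User-Agent": "Mozilla/5.0"},
        postData={"q": "test"},
        returnMultiple=True)

    # ask for a parsed BeautifulSoup object instead of a string
    soup = fetcher.getpage("http://example.com/", soup=True)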