def parse(self, fp, headersonly=False): """Create a message structure from the data in a file. Reads all the data from the file and returns the root of the message structure. Optional headersonly is a flag specifying whether to stop parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ feedparser = EmailFeedParser(self._class) if headersonly: feedparser._set_headersonly() while True: data = fp.read(8192) if not data: break feedparser.feed(data) return feedparser.close()
def parse(self, fp, headersonly=False): """Create a message structure from the data in a file. Reads all the data from the file and returns the root of the message structure. Optional headersonly is a flag specifying whether to stop parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ feedparser = FeedParser(self._class) if headersonly: feedparser._set_headersonly() while True: # <patch> data = fp.read(65536) # </patch> if not data: break feedparser.feed(data) return feedparser.close()
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None): """ Performs a single HTTP request. The 'uri' is the URI of the HTTP resource and can begin with either 'http' or 'https'. The value of 'uri' must be an absolute URI. The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc. There is no restriction on the methods allowed. The 'body' is the entity body to be sent with the request. It is a string object. Any extra headers that are to be sent with the request should be provided in the 'headers' dictionary. The maximum number of redirect to follow before raising an exception is 'redirections. The default is 5. The return value is a tuple of (response, content), the first being and instance of the 'Response' class, the second being a string that contains the response entity body. """ try: if headers is None: headers = {} else: headers = self._normalize_headers(headers) if 'user-agent' not in headers: headers['user-agent'] = "Python-httplib2/%s" % __version__ uri = iri2uri(uri) (scheme, authority, request_uri, defrag_uri) = urlnorm(uri) domain_port = authority.split(":")[0:2] if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http': scheme = 'https' authority = domain_port[0] conn_key = scheme+":"+authority if conn_key in self.connections: conn = self.connections[conn_key] else: if not connection_type: connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout certs = list(self.certificates.iter(authority)) if scheme == 'https' and certs: conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0], cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info) else: conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info) conn.set_debuglevel(debuglevel) if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers: headers['accept-encoding'] = 'gzip, deflate' info = email.message.Message() cached_value = None if self.cache: cachekey = defrag_uri cached_value = self.cache.get(cachekey) if cached_value: # info = email.message_from_string(cached_value) # # Need to replace the line above with the kludge below # to fix the non-existent bug not fixed in this # bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html try: info, content = cached_value.split(b'\r\n\r\n', 1) info = info.decode('utf-8') feedparser = email.feedparser.FeedParser() feedparser.feed(info) info = feedparser.close() feedparser._parse = None except IndexError: self.cache.delete(cachekey) cachekey = None cached_value = None else: cachekey = None if method in self.optimistic_concurrency_methods and self.cache and 'etag' in info and not self.ignore_etag and 'if-match' not in headers: # http://www.w3.org/1999/04/Editing/ headers['if-match'] = info['etag'] if method not in ["GET", "HEAD"] and self.cache and cachekey: # RFC 2616 Section 13.10 self.cache.delete(cachekey) # Check the vary header in the cache to see if this request # matches what varies in the cache. if method in ['GET', 'HEAD'] and 'vary' in info: vary = info['vary'] vary_headers = vary.lower().replace(' ', '').split(',') for header in vary_headers: key = '-varied-%s' % header value = info[key] if headers.get(header, '') != value: cached_value = None break if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers: if '-x-permanent-redirect-url' in info: # Should cached permanent redirects be counted in our redirection count? For now, yes. (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1) response.previous = Response(info) response.previous.fromcache = True else: # Determine our course of action: # Is the cached entry fresh or stale? # Has the client requested a non-cached response? # # There seems to be three possible answers: # 1. [FRESH] Return the cache entry w/o doing a GET # 2. [STALE] Do the GET (but add in cache validators if available) # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request entry_disposition = _entry_disposition(info, headers) if entry_disposition == "FRESH": if not cached_value: info['status'] = '504' content = b"" response = Response(info) if cached_value: response.fromcache = True return (response, content) if entry_disposition == "STALE": if 'etag' in info and not self.ignore_etag and not 'if-none-match' in headers: headers['if-none-match'] = info['etag'] if 'last-modified' in info and not 'last-modified' in headers: headers['if-modified-since'] = info['last-modified'] elif entry_disposition == "TRANSPARENT": pass (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey) if response.status == 304 and method == "GET": # Rewrite the cache entry with the new end-to-end headers # Take all headers that are in response # and overwrite their values in info. # unless they are hop-by-hop, or are listed in the connection header. for key in _get_end2end_headers(response): info[key] = response[key] merged_response = Response(info) if hasattr(response, "_stale_digest"): merged_response._stale_digest = response._stale_digest _updateCache(headers, merged_response, content, self.cache, cachekey) response = merged_response response.status = 200 response.fromcache = True elif response.status == 200: content = new_content else: self.cache.delete(cachekey) content = new_content else: cc = _parse_cache_control(headers) if 'only-if-cached'in cc: info['status'] = '504' response = Response(info) content = b"" else: (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey) except Exception as e: if self.force_exception_to_status_code: if isinstance(e, HttpLib2ErrorWithResponse): response = e.response content = e.content response.status = 500 response.reason = str(e) elif isinstance(e, socket.timeout): content = b"Request Timeout" response = Response( { "content-type": "text/plain", "status": "408", "content-length": len(content) }) response.reason = "Request Timeout" else: content = str(e).encode('utf-8') response = Response( { "content-type": "text/plain", "status": "400", "content-length": len(content) }) response.reason = "Bad Request" else: raise return (response, content)