def _on_connect(self):
    self._remove_timeout()
    if self.request.request_timeout:
        self._timeout = self.io_loop.add_timeout(
            self.start_time + self.request.request_timeout,
            stack_context.wrap(self._on_timeout))
    if (self.request.method not in self._SUPPORTED_METHODS and
            not self.request.allow_nonstandard_methods):
        raise KeyError("unknown method %s" % self.request.method)
    for key in ('network_interface', 'proxy_host', 'proxy_port',
                'proxy_username', 'proxy_password'):
        if getattr(self.request, key, None):
            raise NotImplementedError('%s not supported' % key)
    if "Connection" not in self.request.headers:
        self.request.headers["Connection"] = "close"
    if "Host" not in self.request.headers:
        if '@' in self.parsed.netloc:
            self.request.headers["Host"] = self.parsed.netloc.rpartition('@')[-1]
        else:
            self.request.headers["Host"] = self.parsed.netloc
    username, password = None, None
    if self.parsed.username is not None:
        username, password = self.parsed.username, self.parsed.password
    elif self.request.auth_username is not None:
        username = self.request.auth_username
        password = self.request.auth_password or ''
    if username is not None:
        auth = utf8(username) + b":" + utf8(password)
        self.request.headers["Authorization"] = (b"Basic " +
                                                 base64.b64encode(auth))
    if self.request.user_agent:
        self.request.headers["User-Agent"] = self.request.user_agent
    if not self.request.allow_nonstandard_methods:
        if self.request.method in ("POST", "PATCH", "PUT"):
            assert self.request.body is not None
        else:
            assert self.request.body is None
    if self.request.body is not None:
        self.request.headers["Content-Length"] = str(len(
            self.request.body))
    if (self.request.method == "POST" and
            "Content-Type" not in self.request.headers):
        self.request.headers["Content-Type"] = "application/x-www-form-urlencoded"
    if self.request.use_gzip:
        self.request.headers["Accept-Encoding"] = "gzip"
    req_path = ((self.parsed.path or '/') +
                (('?' + self.parsed.query) if self.parsed.query else ''))
    request_lines = [utf8("%s %s HTTP/1.1" % (self.request.method,
                                              req_path))]
    for k, v in self.request.headers.get_all():
        line = utf8(k) + b": " + utf8(v)
        if b'\n' in line:
            raise ValueError('Newline in header: ' + repr(line))
        request_lines.append(line)
    self.stream.write(b"\r\n".join(request_lines) + b"\r\n\r\n")
    if self.request.body is not None:
        self.stream.write(self.request.body)
    self.stream.read_until_regex(b"\r?\n\r?\n", self._on_headers)
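# --- Usage sketch (not from the original source) ---
# The Authorization header assembled in _on_connect, reproduced standalone.
# Assumes tornado.escape.utf8; the helper name basic_auth_header is made up.
import base64
from tornado.escape import utf8

def basic_auth_header(username, password):
    # "username:password", base64-encoded and prefixed with "Basic "
    auth = utf8(username) + b":" + utf8(password)
    return b"Basic " + base64.b64encode(auth)

# basic_auth_header("user", "secret") == b"Basic dXNlcjpzZWNyZXQ="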
def parse_image(self, url, filename=None):
    """Process an img tag's src and map it to a local file."""
    url = escape.utf8(url)
    image_guid = hashlib.sha1(url).hexdigest()
    x = url.split('.')
    ext = 'jpg'
    if len(x) > 1:
        ext = x[-1]
        if len(ext) > 4:
            ext = ext[0:3]
        ext = re.sub('[^a-zA-Z]', '', ext)
        ext = ext.lower()
    if ext not in ['jpg', 'jpeg', 'gif', 'png', 'bmp']:
        ext = 'jpg'
    y = url.split('/')
    h = hashlib.sha1(str(y[2])).hexdigest()
    hash_dir = os.path.join(h[0:1], h[1:2])
    filename = image_guid + '.' + ext
    img_dir = os.path.join(self.work_dir, 'data', 'images', hash_dir)
    fullname = os.path.join(img_dir, filename)
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    localimage = 'images/%s/%s' % (hash_dir, filename)
    return localimage, fullname
def parse_image(self, url, filename=None):
    """Process an img tag's src and map it to a local file."""
    url = escape.utf8(url)
    image_guid = hashlib.sha1(url).hexdigest()
    x = url.split('.')
    ext = 'jpg'
    if len(x) > 1:
        ext = x[-1]
        if len(ext) > 4:
            ext = ext[0:3]
        ext = re.sub('[^a-zA-Z]', '', ext)
        ext = ext.lower()
    if ext not in ['jpg', 'jpeg', 'gif', 'png', 'bmp']:
        ext = 'jpg'
    y = url.split('/')
    h = hashlib.sha1(str(y[2])).hexdigest()
    hash_dir = os.path.join(h[0:1], h[1:2])
    filename = image_guid + '.' + ext
    img_dir = os.path.join(krconfig.work_dir, 'data', 'images', hash_dir)
    fullname = os.path.join(img_dir, filename)
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    localimage = 'images/%s/%s' % (hash_dir, filename)
    return localimage, fullname
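# --- Usage sketch (hypothetical values) ---
# How parse_image maps a remote URL onto the local images tree, assuming
# krconfig.work_dir == '/srv/reader'.  `parser` stands in for an instance.
#
#   localimage, fullname = parser.parse_image('http://img.example.com/a/cat.png')
#   # ext        -> 'png'  (suffix taken from the URL, whitelisted)
#   # image_guid -> sha1 of the full URL
#   # h          -> sha1 of the host ('img.example.com'); h[0] and h[1]
#   #               become a two-level hash directory
#   # localimage -> 'images/<h[0]>/<h[1]>/<image_guid>.png'
#   # fullname   -> '/srv/reader/data/images/<h[0]>/<h[1]>/<image_guid>.png'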
def environ(request):
    """Converts a `tornado.httpserver.HTTPRequest` to a WSGI environment.
    """
    hostport = request.host.split(":")
    if len(hostport) == 2:
        host = hostport[0]
        port = int(hostport[1])
    else:
        host = request.host
        port = 443 if request.protocol == "https" else 80
    environ = {
        "REQUEST_METHOD": request.method,
        "SCRIPT_NAME": "",
        "PATH_INFO": to_wsgi_str(escape.url_unescape(
            request.path, encoding=None)),
        "QUERY_STRING": request.query,
        "REMOTE_ADDR": request.remote_ip,
        "SERVER_NAME": host,
        "SERVER_PORT": str(port),
        "SERVER_PROTOCOL": request.version,
        "wsgi.version": (1, 0),
        "wsgi.url_scheme": request.protocol,
        "wsgi.input": BytesIO(escape.utf8(request.body)),
        "wsgi.errors": sys.stderr,
        "wsgi.multithread": False,
        "wsgi.multiprocess": True,
        "wsgi.run_once": False,
    }
    if "Content-Type" in request.headers:
        environ["CONTENT_TYPE"] = request.headers.pop("Content-Type")
    if "Content-Length" in request.headers:
        environ["CONTENT_LENGTH"] = request.headers.pop("Content-Length")
    for key, value in request.headers.items():
        environ["HTTP_" + key.replace("-", "_").upper()] = value
    return environ
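# --- Illustration (not from the original source) ---
# The environ produced for a hypothetical "GET /path?q=1" request to
# "example.com:8080" over plain HTTP would contain, among other keys:
#
#   {
#       "REQUEST_METHOD": "GET",
#       "SCRIPT_NAME": "",
#       "PATH_INFO": "/path",
#       "QUERY_STRING": "q=1",
#       "SERVER_NAME": "example.com",
#       "SERVER_PORT": "8080",
#       "wsgi.url_scheme": "http",
#   }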
@gen.coroutine
def resolve(self, host, port, family=0):
    # getHostByName doesn't accept IP addresses, so if the input
    # looks like an IP address just return it immediately.
    if twisted.internet.abstract.isIPAddress(host):
        resolved = host
        resolved_family = socket.AF_INET
    elif twisted.internet.abstract.isIPv6Address(host):
        resolved = host
        resolved_family = socket.AF_INET6
    else:
        deferred = self.resolver.getHostByName(utf8(host))
        resolved = yield gen.Task(deferred.addCallback)
        if twisted.internet.abstract.isIPAddress(resolved):
            resolved_family = socket.AF_INET
        elif twisted.internet.abstract.isIPv6Address(resolved):
            resolved_family = socket.AF_INET6
        else:
            resolved_family = socket.AF_UNSPEC
    if family != socket.AF_UNSPEC and family != resolved_family:
        raise Exception('Requested socket family %d but got %d' %
                        (family, resolved_family))
    result = [
        (resolved_family, (resolved, port)),
    ]
    raise gen.Return(result)
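# --- Usage sketch (assumes the @gen.coroutine decorator above) ---
# Calling the resolver from another coroutine; `resolver` is an instance of
# the class this method belongs to.
#
#   @gen.coroutine
#   def lookup(resolver):
#       addrinfo = yield resolver.resolve('www.example.com', 80)
#       family, (host, port) = addrinfo[0]
#       # family is socket.AF_INET or AF_INET6; host is a literal address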
def down_image(self, url, referer=None, filename=None):
    """download image"""
    print "download: %s" % url,
    url = escape.utf8(url)
    image_guid = hashlib.sha1(url).hexdigest()
    convert_image = self.get_config('reader', 'convert_image')
    x = url.split('.')
    ext = 'jpg'
    if len(x) > 1:
        ext = x[-1]
        if len(ext) > 4:
            ext = ext[0:3]
        ext = re.sub('[^a-zA-Z]', '', ext)
        ext = ext.lower()
    if ext not in ['jpg', 'jpeg', 'gif', 'png', 'bmp']:
        ext = 'jpg'
    y = url.split('/')
    h = hashlib.sha1(str(y[2])).hexdigest()
    hash_dir = os.path.join(h[0:1], h[1:2])
    filename = image_guid + '.' + ext
    img_dir = os.path.join(self.work_dir, 'data', 'images', hash_dir)
    fullname = os.path.join(img_dir, filename)
    localimage = 'images/%s/%s' % (hash_dir, filename)
    if os.path.isfile(fullname) is False:
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent', self.user_agent)
            req.add_header('Accept-Language', 'zh-cn,zh;q=0.7,nd;q=0.3')
            if referer is not None:
                req.add_header('Referer', referer)
            response = urllib2.urlopen(req)
            localFile = open(fullname, 'wb')
            localFile.write(response.read())
            response.close()
            localFile.close()
            if convert_image:
                # flatten the first frame to grayscale with ImageMagick
                os.system("convert -colorspace gray '%s[0]' %s"
                          % (fullname, fullname))
            print "done."
        except Exception, e:
            print 'fail: %s' % e
            localimage = False
        finally:
            pass
    # the source was truncated after "finally:"; returning the same pair as
    # parse_image is an assumption based on the failure path setting
    # localimage = False above
    return localimage, fullname
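# --- Standalone sketch of the fetch performed above (python 2) ---
# URL and header values are hypothetical; the network call is commented out.
import urllib2

req = urllib2.Request('http://img.example.com/a/cat.jpg')
req.add_header('User-Agent', 'Mozilla/5.0 (compatible)')
req.add_header('Accept-Language', 'zh-cn,zh;q=0.7,nd;q=0.3')
req.add_header('Referer', 'http://img.example.com/a/')
# response = urllib2.urlopen(req)
# data = response.read()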
def generate(self, writer):
    value = self.value

    # Compress lots of white space to a single character. If the whitespace
    # breaks a line, have it continue to break a line, but just with a
    # single \n character
    if writer.compress_whitespace and "<pre>" not in value:
        value = re.sub(r"([\t ]+)", " ", value)
        value = re.sub(r"(\s*\n\s*)", "\n", value)

    if value:
        writer.write_line('_append(%r)' % escape.utf8(value), self.line)
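# --- The two substitutions above, applied to a sample chunk ---
import re

value = "a\t  b  \n   c"
value = re.sub(r"([\t ]+)", " ", value)     # runs of tabs/spaces -> one space
value = re.sub(r"(\s*\n\s*)", "\n", value)  # whitespace spanning a newline -> "\n"
assert value == "a b\nc"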
def __call__(self, request):
    data = {}
    response = []

    def start_response(status, response_headers, exc_info=None):
        data["status"] = status
        data["headers"] = response_headers
        return response.append
    app_response = self.wsgi_application(
        WSGIContainer.environ(request), start_response)
    response.extend(app_response)
    body = b"".join(response)
    if hasattr(app_response, "close"):
        app_response.close()
    if not data:
        raise Exception("WSGI app did not call start_response")

    status_code = int(data["status"].split()[0])
    headers = data["headers"]
    header_set = set(k.lower() for (k, v) in headers)
    body = escape.utf8(body)
    if status_code != 304:
        if "content-length" not in header_set:
            headers.append(("Content-Length", str(len(body))))
        if "content-type" not in header_set:
            headers.append(("Content-Type", "text/html; charset=UTF-8"))
    if "server" not in header_set:
        headers.append(("Server", "TornadoServer/%s" % tornado.version))

    parts = [escape.utf8("HTTP/1.1 " + data["status"] + "\r\n")]
    for key, value in headers:
        parts.append(escape.utf8(key) + b": " + escape.utf8(value) + b"\r\n")
    parts.append(b"\r\n")
    parts.append(body)
    request.write(b"".join(parts))
    request.finish()
    self._log(status_code, request)
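# --- Typical wiring for the container above ---
# Mirrors the usage documented for tornado.wsgi.WSGIContainer.
import tornado.httpserver
import tornado.ioloop
import tornado.wsgi

def simple_app(environ, start_response):
    status = "200 OK"
    response_headers = [("Content-type", "text/plain")]
    start_response(status, response_headers)
    return ["Hello world!\n"]

container = tornado.wsgi.WSGIContainer(simple_app)
http_server = tornado.httpserver.HTTPServer(container)
http_server.listen(8888)
tornado.ioloop.IOLoop.instance().start()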
def parse_body_arguments(content_type, body, arguments, files):
    """Parses a form request body.

    Supports ``application/x-www-form-urlencoded`` and
    ``multipart/form-data``.  The ``content_type`` parameter should be
    a string and ``body`` should be a byte string.  The ``arguments``
    and ``files`` parameters are dictionaries that will be updated
    with the parsed contents.
    """
    if content_type.startswith("application/x-www-form-urlencoded"):
        uri_arguments = parse_qs_bytes(native_str(body), keep_blank_values=True)
        for name, values in uri_arguments.items():
            if values:
                arguments.setdefault(name, []).extend(values)
    elif content_type.startswith("multipart/form-data"):
        fields = content_type.split(";")
        for field in fields:
            k, sep, v = field.strip().partition("=")
            if k == "boundary" and v:
                parse_multipart_form_data(utf8(v), body, arguments, files)
                break
        else:
            gen_log.warning("Invalid multipart/form-data")
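# --- Quick check of the urlencoded branch above ---
# Uses the function as defined; values shown are for python 2, where
# parse_qs_bytes yields str values.
arguments, files = {}, {}
parse_body_arguments("application/x-www-form-urlencoded",
                     b"a=1&a=2&b=3", arguments, files)
# arguments -> {'a': ['1', '2'], 'b': ['3']}; files is untouched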
def __init__(self, url, method="GET", headers=None, body=None, auth_username=None, auth_password=None, connect_timeout=None, request_timeout=None, if_modified_since=None, follow_redirects=None, max_redirects=None, user_agent=None, use_gzip=None, network_interface=None, streaming_callback=None, header_callback=None, prepare_curl_callback=None, proxy_host=None, proxy_port=None, proxy_username=None, proxy_password=None, allow_nonstandard_methods=None, validate_cert=None, ca_certs=None, allow_ipv6=None, client_key=None, client_cert=None): r"""All parameters except ``url`` are optional. :arg string url: URL to fetch :arg string method: HTTP method, e.g. "GET" or "POST" :arg headers: Additional HTTP headers to pass on the request :type headers: `~tornado.httputil.HTTPHeaders` or `dict` :arg string auth_username: Username for HTTP "Basic" authentication :arg string auth_password: Password for HTTP "Basic" authentication :arg float connect_timeout: Timeout for initial connection in seconds :arg float request_timeout: Timeout for entire request in seconds :arg if_modified_since: Timestamp for ``If-Modified-Since`` header :type if_modified_since: `datetime` or `float` :arg bool follow_redirects: Should redirects be followed automatically or return the 3xx response? :arg int max_redirects: Limit for ``follow_redirects`` :arg string user_agent: String to send as ``User-Agent`` header :arg bool use_gzip: Request gzip encoding from the server :arg string network_interface: Network interface to use for request :arg callable streaming_callback: If set, ``streaming_callback`` will be run with each chunk of data as it is received, and ``HTTPResponse.body`` and ``HTTPResponse.buffer`` will be empty in the final response. :arg callable header_callback: If set, ``header_callback`` will be run with each header line as it is received (including the first line, e.g. ``HTTP/1.0 200 OK\r\n``, and a final line containing only ``\r\n``. All lines include the trailing newline characters). ``HTTPResponse.headers`` will be empty in the final response. This is most useful in conjunction with ``streaming_callback``, because it's the only way to get access to header data while the request is in progress. :arg callable prepare_curl_callback: If set, will be called with a ``pycurl.Curl`` object to allow the application to make additional ``setopt`` calls. :arg string proxy_host: HTTP proxy hostname. To use proxies, ``proxy_host`` and ``proxy_port`` must be set; ``proxy_username`` and ``proxy_pass`` are optional. Proxies are currently only supported with ``curl_httpclient``. :arg int proxy_port: HTTP proxy port :arg string proxy_username: HTTP proxy username :arg string proxy_password: HTTP proxy password :arg bool allow_nonstandard_methods: Allow unknown values for ``method`` argument? :arg bool validate_cert: For HTTPS requests, validate the server's certificate? :arg string ca_certs: filename of CA certificates in PEM format, or None to use defaults. Note that in ``curl_httpclient``, if any request uses a custom ``ca_certs`` file, they all must (they don't have to all use the same ``ca_certs``, but it's not possible to mix requests with ``ca_certs`` and requests that use the defaults. :arg bool allow_ipv6: Use IPv6 when available? 
Default is false in ``simple_httpclient`` and true in ``curl_httpclient`` :arg string client_key: Filename for client SSL key, if any :arg string client_cert: Filename for client SSL certificate, if any """ if headers is None: headers = httputil.HTTPHeaders() if if_modified_since: headers["If-Modified-Since"] = httputil.format_timestamp( if_modified_since) self.proxy_host = proxy_host self.proxy_port = proxy_port self.proxy_username = proxy_username self.proxy_password = proxy_password self.url = url self.method = method self.headers = headers self.body = utf8(body) self.auth_username = auth_username self.auth_password = auth_password self.connect_timeout = connect_timeout self.request_timeout = request_timeout self.follow_redirects = follow_redirects self.max_redirects = max_redirects self.user_agent = user_agent self.use_gzip = use_gzip self.network_interface = network_interface self.streaming_callback = stack_context.wrap(streaming_callback) self.header_callback = stack_context.wrap(header_callback) self.prepare_curl_callback = stack_context.wrap(prepare_curl_callback) self.allow_nonstandard_methods = allow_nonstandard_methods self.validate_cert = validate_cert self.ca_certs = ca_certs self.allow_ipv6 = allow_ipv6 self.client_key = client_key self.client_cert = client_cert self.start_time = time.time()
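# --- Construction sketch (hypothetical URL and credentials) ---
request = HTTPRequest("http://example.com/api",
                      method="POST",
                      body="a=1&b=2",
                      connect_timeout=5.0,
                      request_timeout=10.0,
                      auth_username="user",
                      auth_password="secret")
# request.body is utf8-encoded by __init__; passing if_modified_since
# would add an If-Modified-Since header automatically.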
def _curl_setup_request(curl, request, buffer, headers):
    curl.setopt(pycurl.URL, native_str(request.url))

    # libcurl's magic "Expect: 100-continue" behavior causes delays
    # with servers that don't support it (which include, among others,
    # Google's OpenID endpoint).  Additionally, this behavior has
    # a bug in conjunction with the curl_multi_socket_action API
    # (https://sourceforge.net/tracker/?func=detail&atid=100976&aid=3039744&group_id=976),
    # which increases the delays.  It's more trouble than it's worth,
    # so just turn off the feature (yes, setting Expect: to an empty
    # value is the official way to disable this)
    if "Expect" not in request.headers:
        request.headers["Expect"] = ""

    # libcurl adds Pragma: no-cache by default; disable that too
    if "Pragma" not in request.headers:
        request.headers["Pragma"] = ""

    # Request headers may be either a regular dict or HTTPHeaders object
    if isinstance(request.headers, httputil.HTTPHeaders):
        curl.setopt(pycurl.HTTPHEADER,
                    [native_str("%s: %s" % i) for i in request.headers.get_all()])
    else:
        curl.setopt(pycurl.HTTPHEADER,
                    [native_str("%s: %s" % i) for i in request.headers.items()])

    if request.header_callback:
        curl.setopt(pycurl.HEADERFUNCTION, request.header_callback)
    else:
        curl.setopt(pycurl.HEADERFUNCTION,
                    lambda line: _curl_header_callback(headers, line))
    if request.streaming_callback:
        write_function = request.streaming_callback
    else:
        write_function = buffer.write
    if bytes_type is str:  # py2
        curl.setopt(pycurl.WRITEFUNCTION, write_function)
    else:  # py3
        # Upstream pycurl doesn't support py3, but ubuntu 12.10 includes
        # a fork/port. That version has a bug in which it passes unicode
        # strings instead of bytes to the WRITEFUNCTION. This means that
        # if you use a WRITEFUNCTION (which tornado always does), you cannot
        # download arbitrary binary data. This needs to be fixed in the
        # ported pycurl package, but in the meantime this lambda will
        # make it work for downloading (utf8) text.
        curl.setopt(pycurl.WRITEFUNCTION, lambda s: write_function(utf8(s)))
    curl.setopt(pycurl.FOLLOWLOCATION, request.follow_redirects)
    curl.setopt(pycurl.MAXREDIRS, request.max_redirects)
    curl.setopt(pycurl.CONNECTTIMEOUT_MS, int(1000 * request.connect_timeout))
    curl.setopt(pycurl.TIMEOUT_MS, int(1000 * request.request_timeout))
    if request.user_agent:
        curl.setopt(pycurl.USERAGENT, native_str(request.user_agent))
    else:
        curl.setopt(pycurl.USERAGENT, "Mozilla/5.0 (compatible; pycurl)")
    if request.network_interface:
        curl.setopt(pycurl.INTERFACE, request.network_interface)
    if request.use_gzip:
        curl.setopt(pycurl.ENCODING, "gzip,deflate")
    else:
        curl.setopt(pycurl.ENCODING, "none")
    if request.proxy_host and request.proxy_port:
        curl.setopt(pycurl.PROXY, request.proxy_host)
        curl.setopt(pycurl.PROXYPORT, request.proxy_port)
        if request.proxy_username:
            credentials = '%s:%s' % (request.proxy_username,
                                     request.proxy_password)
            curl.setopt(pycurl.PROXYUSERPWD, credentials)
    else:
        curl.setopt(pycurl.PROXY, '')
    if request.validate_cert:
        curl.setopt(pycurl.SSL_VERIFYPEER, 1)
        curl.setopt(pycurl.SSL_VERIFYHOST, 2)
    else:
        curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    if request.ca_certs is not None:
        curl.setopt(pycurl.CAINFO, request.ca_certs)
    else:
        # There is no way to restore pycurl.CAINFO to its default value
        # (Using unsetopt makes it reject all certificates).
        # I don't see any way to read the default value from python so it
        # can be restored later.  We'll have to just leave CAINFO untouched
        # if no ca_certs file was specified, and require that if any
        # request uses a custom ca_certs file, they all must.
        pass

    if request.allow_ipv6 is False:
        # Curl behaves reasonably when DNS resolution gives an ipv6 address
        # that we can't reach, so allow ipv6 unless the user asks to disable.
        # (but see version check in _process_queue above)
        curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)

    # Set the request method through curl's irritating interface which makes
    # up names for almost every single method
    curl_options = {
        "GET": pycurl.HTTPGET,
        "POST": pycurl.POST,
        "PUT": pycurl.UPLOAD,
        "HEAD": pycurl.NOBODY,
    }
    custom_methods = set(["DELETE"])
    for o in curl_options.values():
        curl.setopt(o, False)
    if request.method in curl_options:
        curl.unsetopt(pycurl.CUSTOMREQUEST)
        curl.setopt(curl_options[request.method], True)
    elif request.allow_nonstandard_methods or request.method in custom_methods:
        curl.setopt(pycurl.CUSTOMREQUEST, request.method)
    else:
        raise KeyError('unknown method ' + request.method)

    # Handle curl's cryptic options for every individual HTTP method
    if request.method in ("POST", "PUT"):
        request_buffer = BytesIO(utf8(request.body))
        curl.setopt(pycurl.READFUNCTION, request_buffer.read)
        if request.method == "POST":
            def ioctl(cmd):
                if cmd == curl.IOCMD_RESTARTREAD:
                    request_buffer.seek(0)
            curl.setopt(pycurl.IOCTLFUNCTION, ioctl)
            curl.setopt(pycurl.POSTFIELDSIZE, len(request.body))
        else:
            curl.setopt(pycurl.INFILESIZE, len(request.body))

    if request.auth_username is not None:
        userpwd = "%s:%s" % (request.auth_username, request.auth_password or '')
        curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
        curl.setopt(pycurl.USERPWD, native_str(userpwd))
        gen_log.debug("%s %s (username: %r)", request.method, request.url,
                      request.auth_username)
    else:
        curl.unsetopt(pycurl.USERPWD)
        gen_log.debug("%s %s", request.method, request.url)

    if request.client_cert is not None:
        curl.setopt(pycurl.SSLCERT, request.client_cert)
    if request.client_key is not None:
        curl.setopt(pycurl.SSLKEY, request.client_key)

    if threading.activeCount() > 1:
        # libcurl/pycurl is not thread-safe by default.  When multiple threads
        # are used, signals should be disabled.  This has the side effect
        # of disabling DNS timeouts in some environments (when libcurl is
        # not linked against ares), so we don't do it when there is only one
        # thread.  Applications that use many short-lived threads may need
        # to set NOSIGNAL manually in a prepare_curl_callback since
        # there may not be any other threads running at the time we call
        # threading.activeCount.
        curl.setopt(pycurl.NOSIGNAL, 1)
    if request.prepare_curl_callback is not None:
        request.prepare_curl_callback(curl)
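# --- Example of the prepare_curl_callback hook honored above ---
# The option names are standard pycurl/libcurl constants; the threshold
# values are illustrative.
import pycurl

def prepare_curl(curl):
    curl.setopt(pycurl.NOSIGNAL, 1)         # see the threading note above
    curl.setopt(pycurl.LOW_SPEED_LIMIT, 1)  # abort transfers slower than
    curl.setopt(pycurl.LOW_SPEED_TIME, 30)  # 1 byte/s for 30 seconds

# request = HTTPRequest(url, prepare_curl_callback=prepare_curl)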