class Response_Parser(object):

    def __init__(self):
        self.parser = HttpParser()
        self.len_response = 0
        self.len_body = 0
        self.body = None

    def parse(self, raw_response):
        self.len_response = len(bytearray(raw_response))
        self.parser.execute(raw_response, self.len_response)
        self.body = self.parser.recv_body()
        self.len_body = len(bytearray(self.body))

    def get_all_keys(self):
        """Get all the keys in the response headers."""
        return self.parser.get_headers().keys()

    def get_keys(self, *args):
        header_keys = {}
        for key in args:
            header_keys[key] = self.parser.get_headers().get(key, '-')
        return header_keys

    def get_response(self, *args):
        values = self.get_keys(*args)
        status_code = self.parser.get_status_code()
        obj = HTTP_Response(status_code, values, self.len_response, self.len_body)
        return obj

    def get_body(self):
        return self.body
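# A minimal usage sketch for Response_Parser, assuming a complete response
# is already buffered (HTTP_Response comes from the surrounding project):
raw = (b"HTTP/1.1 200 OK\r\n"
       b"Content-Type: text/plain\r\n"
       b"Content-Length: 2\r\n"
       b"\r\n"
       b"ok")
rp = Response_Parser()
rp.parse(raw)
print(rp.get_keys('Content-Type', 'X-Missing'))  # absent keys map to '-'
print(rp.len_response, rp.len_body)              # 66 2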
def post_report(sock):
    st = os.statvfs(haystack_path)
    available_size = st.f_bavail * st.f_frsize
    obj = {}
    obj["listenip"] = listenip
    obj["listenport"] = listenport
    obj["disk_available_size"] = available_size
    obj["master"] = master
    obj["groupid"] = groupid
    obj["last_fileno"] = haystack.haystack_last_fileno
    body = json.dumps(obj)
    sock.send("POST /report HTTP/1.1\r\n")
    sock.send("Host: %s:%d\r\n" % (track.ip, track.port))
    sock.send("Content-Length: %d\r\n" % len(body))
    sock.send("Content-Type: application/json\r\n")
    sock.send("Connection: keep-alive\r\n")
    sock.send("\r\n")
    sock.send(body)
    parser = HttpParser()
    while True:
        data = sock.recv(1024)
        if not data:
            return False
        recved = len(data)
        nparsed = parser.execute(data, recved)
        assert nparsed == recved
        if parser.is_message_complete():
            break
    return parser.get_status_code() == 200
def post_sync(sock, masterip, masterport):
    obj = {"last_fileno": haystack.haystack_last_fileno}
    body = json.dumps(obj)
    sock.send("POST /sync HTTP/1.1\r\n")
    sock.send("Host: %s:%d\r\n" % (masterip, masterport))
    sock.send("Content-Length: %d\r\n" % len(body))
    sock.send("Content-Type: application/json\r\n")
    sock.send("Connection: keep-alive\r\n")
    sock.send("\r\n")
    sock.send(body)
    parser = HttpParser()
    while True:
        # Ugly: read one byte at a time so we never consume bytes that
        # belong to the next HTTP message on this keep-alive connection.
        data = sock.recv(1)
        if not data:
            return False
        recved = len(data)
        nparsed = parser.execute(data, recved)
        assert nparsed == recved
        if parser.is_message_complete():
            break
    return parser.get_status_code() == 200
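# A sketch of an alternative to the byte-at-a-time read above: recv() in
# larger chunks and use execute()'s return value to find where the current
# message ends, handing any overflow bytes back to the caller (the
# read_response name and the leftover contract are illustrative only):
def read_response(sock):
    parser = HttpParser()
    leftover = b""
    while not parser.is_message_complete():
        data = sock.recv(1024)
        if not data:
            return None, b""
        nparsed = parser.execute(data, len(data))
        if parser.is_message_complete():
            # execute() stops at the end of the message, so anything past
            # nparsed belongs to the next response on this connection.
            leftover = data[nparsed:]
    return parser.get_status_code(), leftover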
def __init__(self, raw):
    resp = HttpParser()
    resp.execute(raw.response, len(raw.response))
    self.headers = resp.get_headers()
    # Note: reaches into the parser's private _body list instead of
    # calling recv_body().
    self.body = "".join(resp._body)
    self.raw = raw
    self.code = resp.get_status_code()
    self._json = None
def request(self, method, url, headers=None, body=None, timeout=None):
    '''Issues a `method` request to `path` on the connected server.  Sends
    along `headers`, and body.

    Very low level--you must set "host" yourself, for example.  It will
    set Content-Length, however.
    '''
    headers = headers or {}
    url_info = urlparse(url)
    fake_wsgi = dict(
        (cgi_name(n), v) for n, v in headers.iteritems())
    fake_wsgi.update({
        'REQUEST_METHOD': method,
        'SCRIPT_NAME': '',
        'PATH_INFO': url_info[2],
        'QUERY_STRING': url_info[4],
        'wsgi.version': (1, 0),
        'wsgi.url_scheme': 'http',  # XXX incomplete
        'wsgi.input': cStringIO.StringIO(body or ''),
        'wsgi.errors': FileLikeErrorLogger(hlog),
        'wsgi.multithread': False,
        'wsgi.multiprocess': False,
        'wsgi.run_once': False,
    })
    req = Request(fake_wsgi)
    timeout_handler = TimeoutHandler(timeout or 60)
    send('%s %s HTTP/1.1\r\n%s' % (req.method, req.url, str(req.headers)))
    if body:
        send(body)

    h = HttpParser()
    body = []
    data = None
    while True:
        if data:
            used = h.execute(data, len(data))
            if h.is_headers_complete():
                body.append(h.recv_body())
            if h.is_message_complete():
                data = data[used:]
                break
        ev, val = first(receive_any=True, sleep=timeout_handler.remaining())
        if ev == 'sleep':
            timeout_handler.timeout()
        data = val

    resp = Response(
        response=''.join(body),
        status=h.get_status_code(),
        headers=h.get_headers(),
    )
    return resp
class Session:

    def __init__(self, current_key_hex, partner_key_hex):
        # self.body = []
        self.parser = HttpParser(kind=2, decompress=True)
        self.data_bytes = 0
        self.total_bytes = 0
        self.current_key_hex = current_key_hex
        self.partner_key_hex = partner_key_hex
        self.is_request = None
        self.service = None

    def getPartner(self):
        return sessions[self.partner_key_hex]

    def getService(self):
        if self.is_request is False:
            return self.getPartner().getService()
        if self.is_request is None:
            return '_unknown'
        if self.service is None:
            self.service = getServiceForQS(self.parser.get_query_string())
        return self.service

    def eat(self, payload_string, bytes_sent):
        received_len = len(payload_string)
        self.data_bytes += received_len
        self.total_bytes += bytes_sent
        parsed_len = self.parser.execute(payload_string, received_len)
        # assert received_len == parsed_len
        # if self.parser.is_headers_complete():
        #     eprint(self.parser.get_headers())
        # if self.parser.is_partial_body():
        #     self.body.append(self.parser.recv_body())
        # if self.parser.is_message_complete():
        #     eprint("".join(self.body))
        if self.parser.get_status_code() != 0:
            self.is_request = False
            addBytesOutboundFromService(bytes_sent, self.getService())
            # eprint(payload_string)
        elif self.parser.is_message_begin():
            self.is_request = True
            addBytesInboundToService(bytes_sent, self.getService())
        else:
            addBytesUnknownboundToService(bytes_sent, self.getService())
        # if (self.parser.is_headers_complete() and not self.parser.is_message_complete()):
        #     eprint("expected: %s, so far: %d" % (self.parser.get_headers().get('CONTENT-LENGTH'), self.data_bytes))
        if self.parser.is_message_complete():
            eprint("end!")
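# A minimal sketch of the request/response heuristic Session.eat relies on:
# with kind=2 the parser detects the message type itself, and
# get_status_code() stays 0 until a status line has been seen. The payloads
# below are made up for illustration.
req = HttpParser(kind=2)
data = b"GET /api?svc=demo HTTP/1.1\r\nHost: x\r\n\r\n"
req.execute(data, len(data))
print(req.get_status_code())   # 0 -> treated as a request

resp = HttpParser(kind=2)
data = b"HTTP/1.1 204 No Content\r\n\r\n"
resp.execute(data, len(data))
print(resp.get_status_code())  # 204 -> treated as a response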
def process(indir, outdir):
    findstr = os.path.join(indir, '*')
    for fn in glob.glob(findstr):
        print fn
        with open(fn, 'rb') as f:
            http_bin = f.read()
        n = 0
        http_hostname = 'unknown'
        full_http = ''
        while n < len(http_bin):
            http = HttpParser()
            nparsed = http.execute(http_bin[n:], len(http_bin) - n)
            if not http.is_message_complete():
                break
            if http.get_path() != '':
                # send
                http_method = http_bin[n:].split()[0]  # http.get_method() -- seems bugged
                http_path = http_bin[n:].split()[1]
                http_request = parse_http_packet(http.get_headers(), http.recv_body())
                http_hostname = 'unknown'
                if 'Host' in http.get_headers():
                    http_hostname = http.get_headers()['Host']
                print http_hostname
                nparsed -= 1
                full_http = http_method + ' ' + http_path + '\n'
                full_http += http_request + '\n'
                save_http_packet(outdir, os.path.basename(fn), http_hostname,
                                 http_path, 'send', full_http)
            else:
                # recv
                http_status = http.get_status_code()
                http_reply = parse_http_packet(http.get_headers(), http.recv_body())
                full_http += str(http_status) + '\n'
                full_http += http_reply
                save_http_packet(outdir, os.path.basename(fn), http_hostname,
                                 '', 'recv', full_http)
            n += nparsed
def heartbeat(sock):
    ip, port = sock.getpeername()
    parser = HttpParser()
    sock.send("GET /ping HTTP/1.1\r\nHost: %s:%d\r\n\r\n" % (ip, port))
    while True:
        data = sock.recv(1024)
        if not data:
            return False
        recved = len(data)
        nparsed = parser.execute(data, recved)
        assert nparsed == recved
        if parser.is_message_complete():
            break
    return parser.get_status_code() == 200
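# A minimal offline sketch of the same parse loop, feeding the parser in
# two chunks instead of from a socket; the response bytes are made up:
p = HttpParser()
for chunk in (b"HTTP/1.1 200 OK\r\nContent-Le", b"ngth: 4\r\n\r\npong"):
    assert p.execute(chunk, len(chunk)) == len(chunk)
    if p.is_message_complete():
        break
print(p.get_status_code() == 200)  # True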
def makeRequest(self, host, url="/", port=80, method='GET', headers=None, postdata=None):
    assert self.e is not None
    evSet = self.e.wait()  # noqa: F841
    # log.debug("Generating raw http request")
    self.s.connect((host, port))
    if headers is None:
        headers = {
            "Accept": "*/*",
            "User-Agent": self.useragent
        }
    req = self.rawHttpReq(host, url, method, headers, postdata)
    self.s.sendall(req.encode())

    h = []
    body = []
    p = HttpParser()
    tlen = 0
    while True:
        data = self.s.recv(2048)
        if not data:
            break
        rlen = len(data)
        tlen += rlen
        nparsed = p.execute(data, rlen)
        assert nparsed == rlen
        if p.is_headers_complete():
            h = p.get_headers()
            # log.debug(p.get_headers())
        if p.is_partial_body():
            body.append(p.recv_body())
        if p.is_message_complete():
            break
    self.s.close()
    res = {'status': p.get_status_code(),
           'length': tlen,
           'headers': h,
           'body': body,
           'request': req}
    print(res)
def receive(self):
    h = HttpParser()
    body = []
    data = None
    while True:
        if data:
            used = h.execute(data, len(data))
            if h.is_headers_complete():
                body.append(h.recv_body())
            if h.is_message_complete():
                data = data[used:]
                break
        data = self.s.recv(BUFSIZE)
        if not data:
            # Connection closed before the message completed; avoid
            # spinning forever on empty recv() results.
            break
    return Response(response=''.join(body),
                    status=h.get_status_code(),
                    headers=h.get_headers(),
                    )
def parse_html(self):
    try:
        resolve_ip = ''
        data = []
        filter_list = ['*', '> ', '< ', '{']
        for item in self.result.split("\n"):
            if 'Trying' in item:
                resolve_ip = item.replace('*', "").replace(
                    "Trying", "").replace("...", "").strip()
                log.logger.info('resolve_ip: %s ' % (resolve_ip))
            matching = [s for s in filter_list if s in item[:2]]
            if len(matching) == 0:
                data.append(item.encode('utf-8'))
        parsing_string = b("\r\n").join(data)
        p = HttpParser()
        p.execute(parsing_string, len(parsing_string))
        status_code = str(p.get_status_code())
        header_obj = p.get_headers()
        # body = str(p.recv_body())
        header_list = []
        if resolve_ip:
            header_list.append('%s:%s' % ("resolve ip", resolve_ip.strip()))
        for key, value in header_obj.items():
            header_list.append('%s:%s' % (key, value))
        header = ("<br/>").join(header_list)
        body = self.content["result"]
        log.logger.info('resolve_ip :%s ' % (resolve_ip))
        log.logger.info('status_code :%s ' % (status_code))
        log.logger.info('header :%s ' % (header))
        log.logger.info('body :%s ' % (body))
        return status_code, header, body
    except Exception as e:
        log.logger.info('Exception: %s ' % (str(e)))
        return None, None, str(e)
class HttpStream(object):
    """ An HTTP parser providing higher-level access to a readable,
    sequential io.RawIOBase object. You can use implementations of
    http_parser.reader (IterReader, StringReader, SocketReader) or
    create your own. """

    def __init__(self, stream, kind=HTTP_BOTH, decompress=False):
        """ constructor of HttpStream.

        :attr stream: an io.RawIOBase object
        :attr kind: Int, could be 0 to parse only requests,
        1 to parse only responses or 2 if we want to let
        the parser detect the type.
        """
        self.parser = HttpParser(kind=kind, decompress=decompress)
        self.stream = stream

    def _check_headers_complete(self):
        if self.parser.is_headers_complete():
            return

        while True:
            try:
                next(self)
            except StopIteration:
                if self.parser.is_headers_complete():
                    return
                raise NoMoreData("Can't parse headers")

            if self.parser.is_headers_complete():
                return

    def _wait_status_line(self, cond):
        if self.parser.is_headers_complete():
            return True

        data = []
        if not cond():
            while True:
                try:
                    d = next(self)
                    data.append(d)
                except StopIteration:
                    if self.parser.is_headers_complete():
                        return True
                    raise BadStatusLine(b"".join(data))
                if cond():
                    return True
        return True

    def _wait_on_url(self):
        return self._wait_status_line(self.parser.get_url)

    def _wait_on_status(self):
        return self._wait_status_line(self.parser.get_status_code)

    def url(self):
        """ get full url of the request """
        self._wait_on_url()
        return self.parser.get_url()

    def path(self):
        """ get path of the request (url without query string and
        fragment) """
        self._wait_on_url()
        return self.parser.get_path()

    def query_string(self):
        """ get query string of the url """
        self._wait_on_url()
        return self.parser.get_query_string()

    def fragment(self):
        """ get fragment of the url """
        self._wait_on_url()
        return self.parser.get_fragment()

    def version(self):
        self._wait_on_status()
        return self.parser.get_version()

    def status_code(self):
        """ get status code of a response as integer """
        self._wait_on_status()
        return self.parser.get_status_code()

    def status(self):
        """ return complete status with reason """
        status_code = self.status_code()
        reason = status_reasons.get(int(status_code), 'unknown')
        return "%s %s" % (status_code, reason)

    def method(self):
        """ get HTTP method as string """
        self._wait_on_status()
        return self.parser.get_method()

    def headers(self):
        """ get request/response headers, headers are returned in an
        OrderedDict that allows you to get values using insensitive
        keys. """
        self._check_headers_complete()
        return self.parser.get_headers()

    def should_keep_alive(self):
        """ return True if the connection should be kept alive """
        self._check_headers_complete()
        return self.parser.should_keep_alive()

    def is_chunked(self):
        """ return True if Transfer-Encoding header value is chunked """
        self._check_headers_complete()
        return self.parser.is_chunked()

    def wsgi_environ(self, initial=None):
        """ get WSGI environ based on the current request.

        :attr initial: dict, initial values to fill in environ.
        """
        self._check_headers_complete()
        return self.parser.get_wsgi_environ()

    def body_file(self, buffering=None, binary=True, encoding=None,
                  errors=None, newline=None):
        """ return the body as a buffered stream object. If binary is
        true an io.BufferedReader will be returned, else an
        io.TextIOWrapper.
        """
        self._check_headers_complete()

        if buffering is None:
            buffering = -1
        if buffering < 0:
            buffering = DEFAULT_BUFFER_SIZE

        raw = HttpBodyReader(self)
        buf = BufferedReader(raw, buffering)
        if binary:
            return buf
        text = TextIOWrapper(buf, encoding, errors, newline)
        return text

    def body_string(self, binary=True, encoding=None, errors=None,
                    newline=None):
        """ return body as string """
        return self.body_file(binary=binary, encoding=encoding,
                              newline=newline).read()

    def __iter__(self):
        return self

    def __next__(self):
        if self.parser.is_message_complete():
            raise StopIteration

        # fetch data
        b = bytearray(DEFAULT_BUFFER_SIZE)
        recved = self.stream.readinto(b)
        if recved is None:
            raise NoMoreData("no more data")

        del b[recved:]
        to_parse = bytes(b)
        # parse data
        nparsed = self.parser.execute(to_parse, recved)
        if nparsed != recved and not self.parser.is_message_complete():
            raise ParserError("nparsed != recved (%s != %s) [%s]" % (
                nparsed, recved, bytes_to_str(to_parse)))

        if recved == 0:
            raise StopIteration

        return to_parse

    next = __next__
def request(self, method, url, headers=None, body=None, timeout=None):
    '''Issues a `method` request to `path` on the connected server.  Sends
    along `headers`, and body.

    Very low level--you must set "host" yourself, for example.  It will
    set Content-Length, however.
    '''
    headers = headers or {}
    url_info = urlparse(url)
    fake_wsgi = dict(
        (cgi_name(n), str(v).strip()) for n, v in headers.iteritems())

    if body and 'CONTENT_LENGTH' not in fake_wsgi:
        # If the caller hasn't set their own Content-Length but submitted
        # a body, we auto-set the Content-Length header here.
        fake_wsgi['CONTENT_LENGTH'] = str(len(body))

    fake_wsgi.update({
        'REQUEST_METHOD': method,
        'SCRIPT_NAME': '',
        'PATH_INFO': url_info[2],
        'QUERY_STRING': url_info[4],
        'wsgi.version': (1, 0),
        'wsgi.url_scheme': 'http',  # XXX incomplete
        'wsgi.input': cStringIO.StringIO(body or ''),
        'wsgi.errors': FileLikeErrorLogger(hlog),
        'wsgi.multithread': False,
        'wsgi.multiprocess': False,
        'wsgi.run_once': False,
    })
    req = Request(fake_wsgi)
    timeout_handler = TimeoutHandler(timeout or 60)

    url = str(req.path)
    if req.query_string:
        url += '?' + str(req.query_string)

    send('%s %s HTTP/1.1\r\n%s' % (req.method, url, str(req.headers)))
    if body:
        send(body)

    h = HttpParser()
    body = []
    data = None
    while True:
        if data:
            used = h.execute(data, len(data))
            if h.is_headers_complete():
                body.append(h.recv_body())
            if h.is_message_complete():
                data = data[used:]
                break
        ev, val = first(receive_any=True, sleep=timeout_handler.remaining())
        if ev == 'sleep':
            timeout_handler.timeout()
        data = val

    resp = Response(
        response=''.join(body),
        status=h.get_status_code(),
        headers=h.get_headers(),
    )
    return resp
class HttpStream(object):
    """ An HTTP parser providing higher-level access to a readable,
    sequential io.RawIOBase object. You can use implementations of
    http_parser.reader (IterReader, StringReader, SocketReader) or
    create your own. """

    def __init__(self, stream, kind=HTTP_BOTH, decompress=False):
        """ constructor of HttpStream.

        :attr stream: an io.RawIOBase object
        :attr kind: Int, could be 0 to parse only requests,
        1 to parse only responses or 2 if we want to let
        the parser detect the type.
        """
        self.parser = HttpParser(kind=kind, decompress=decompress)
        self.stream = stream

    def _check_headers_complete(self):
        if self.parser.is_headers_complete():
            return

        while True:
            try:
                data = self.next()
            except StopIteration:
                if self.parser.is_headers_complete():
                    return
                raise NoMoreData()

            if self.parser.is_headers_complete():
                return

    def url(self):
        """ get full url of the request """
        self._check_headers_complete()
        return self.parser.get_url()

    def path(self):
        """ get path of the request (url without query string and
        fragment) """
        self._check_headers_complete()
        return self.parser.get_path()

    def query_string(self):
        """ get query string of the url """
        self._check_headers_complete()
        return self.parser.get_query_string()

    def fragment(self):
        """ get fragment of the url """
        self._check_headers_complete()
        return self.parser.get_fragment()

    def version(self):
        self._check_headers_complete()
        return self.parser.get_version()

    def status_code(self):
        """ get status code of a response as integer """
        self._check_headers_complete()
        return self.parser.get_status_code()

    def status(self):
        """ return complete status with reason """
        status_code = self.status_code()
        reason = status_reasons.get(int(status_code), 'unknown')
        return "%s %s" % (status_code, reason)

    def method(self):
        """ get HTTP method as string """
        self._check_headers_complete()
        return self.parser.get_method()

    def headers(self):
        """ get request/response headers, headers are returned in an
        OrderedDict that allows you to get values using insensitive
        keys. """
        self._check_headers_complete()
        return self.parser.get_headers()

    def should_keep_alive(self):
        """ return True if the connection should be kept alive """
        self._check_headers_complete()
        return self.parser.should_keep_alive()

    def is_chunked(self):
        """ return True if Transfer-Encoding header value is chunked """
        self._check_headers_complete()
        return self.parser.is_chunked()

    def wsgi_environ(self, initial=None):
        """ get WSGI environ based on the current request.

        :attr initial: dict, initial values to fill in environ.
        """
        self._check_headers_complete()
        return self.parser.get_wsgi_environ()

    def body_file(self, buffering=None, binary=True, encoding=None,
                  errors=None, newline=None):
        """ return the body as a buffered stream object. If binary is
        true an io.BufferedReader will be returned, else an
        io.TextIOWrapper.
        """
        self._check_headers_complete()

        if buffering is None:
            buffering = -1
        if buffering < 0:
            buffering = DEFAULT_BUFFER_SIZE

        raw = HttpBodyReader(self)
        buffer = BufferedReader(raw, buffering)
        if binary:
            return buffer
        text = TextIOWrapper(buffer, encoding, errors, newline)
        return text

    def body_string(self, binary=True, encoding=None, errors=None,
                    newline=None):
        """ return body as string """
        return self.body_file(binary=binary, encoding=encoding,
                              newline=newline).read()

    def __iter__(self):
        return self

    def next(self):
        if self.parser.is_message_complete():
            raise StopIteration

        # fetch data
        b = bytearray(DEFAULT_BUFFER_SIZE)
        recved = self.stream.readinto(b)
        if recved is None:
            raise NoMoreData("no more data")

        del b[recved:]

        # parse data
        nparsed = self.parser.execute(bytes(b), recved)
        if nparsed != recved and not self.parser.is_message_complete():
            raise ParserError("nparsed != recved")

        if recved == 0:
            raise StopIteration

        return bytes(b)
def request(self, method, url, headers=None, body=None, timeout=None):
    """Issues a `method` request to `path` on the connected server.  Sends
    along `headers`, and body.

    Very low level--you must set "host" yourself, for example.  It will
    set Content-Length, however.
    """
    headers = headers or {}
    url_info = urlparse(url)
    fake_wsgi = dict((cgi_name(n), str(v).strip()) for n, v in headers.iteritems())

    if body and "CONTENT_LENGTH" not in fake_wsgi:
        # If the caller hasn't set their own Content-Length but submitted
        # a body, we auto-set the Content-Length header here.
        fake_wsgi["CONTENT_LENGTH"] = str(len(body))

    fake_wsgi.update(
        {
            "REQUEST_METHOD": method,
            "SCRIPT_NAME": "",
            "PATH_INFO": url_info[2],
            "QUERY_STRING": url_info[4],
            "wsgi.version": (1, 0),
            "wsgi.url_scheme": "http",  # XXX incomplete
            "wsgi.input": cStringIO.StringIO(body or ""),
            "wsgi.errors": FileLikeErrorLogger(hlog),
            "wsgi.multithread": False,
            "wsgi.multiprocess": False,
            "wsgi.run_once": False,
        }
    )
    req = Request(fake_wsgi)
    timeout_handler = TimeoutHandler(timeout or 60)

    url = str(req.path)
    if req.query_string:
        url += "?" + str(req.query_string)

    send("%s %s HTTP/1.1\r\n%s" % (req.method, url, str(req.headers)))
    if body:
        send(body)

    h = HttpParser()
    body = []
    data = None
    while True:
        if data:
            used = h.execute(data, len(data))
            if h.is_headers_complete():
                body.append(h.recv_body())
            if h.is_message_complete():
                data = data[used:]
                break
        ev, val = first(receive_any=True, sleep=timeout_handler.remaining())
        if ev == "sleep":
            timeout_handler.timeout()
        data = val

    resp = Response(response="".join(body), status=h.get_status_code(), headers=h.get_headers())
    return resp
class MitmProtocol(asyncio.Protocol):
    ''' Handles details of MITMing a TLS connection. '''

    def __init__(self, loop, http_version, proxy):
        ''' Constructor. '''
        self._http_version = http_version
        self._loop = loop
        self._parser = HttpParser()
        self._proxy = proxy
        self._received = asyncio.Future()
        self._body = b''

    def connection_made(self, transport):
        ''' Save a reference to the transport. '''
        log.debug('MITM connection opened.')
        self._transport = transport
        cert = self._transport.get_extra_info('peercert')
        log.debug('MITM upstream certificate: {}'.format(cert))
        self._loop.call_soon(self._proxy.start_tls, self._http_version)

    def connection_lost(self, exc):
        log.debug('MITM connection closed.')
        self._received.cancel()

    def data_received(self, data):
        ''' Accumulate request data. '''
        log.debug('MITM data received: {}'.format(data))
        self._parser.execute(data, len(data))

        if self._parser.is_partial_body():
            self._body += self._parser.recv_body()

        if self._parser.is_message_complete():
            version = self._parser.get_version()
            status = self._parser.get_status_code()
            reason = None  # For some reason, the parser doesn't expose this :(
            headers = self._parser.get_headers()

            log.debug('MITM upstream status: {}'.format(status))
            log.debug('MITM upstream headers: {}'.format(headers))
            log.debug('MITM upstream body: {}...'.format(self._body[:1000]))

            self._received.set_result(
                (version, status, reason, headers, self._body)
            )
            self._transport.close()

    def forward(self, data):
        ''' Forward data to upstream host. '''
        log.debug('MITM sending data: {}'.format(data))
        self._transport.write(data)

    @asyncio.coroutine
    def receive(self):
        ''' Read data received by this MITM instance. '''
        response = yield from self._received
        return response
async def fetch(
    url: str,
    method: str = "GET",
    headers=None,
    body: Optional[bytes] = None,
    connect_timeout=DEFAULT_CONNECT_TIMEOUT,
    request_timeout=DEFAULT_REQUEST_TIMEOUT,
    resolver=resolve,
    max_buffer_size=DEFAULT_BUFFER_SIZE,
    follow_redirects: bool = False,
    max_redirects=DEFAULT_MAX_REDIRECTS,
    validate_cert=config.http_client.validate_certs,
    allow_proxy: bool = False,
    proxies=None,
    user: Optional[str] = None,
    password: Optional[str] = None,
    content_encoding: Optional[str] = None,
    eof_mark: Optional[bytes] = None,
) -> Tuple[int, Dict[str, Any], bytes]:
    """
    :param url: Fetch URL
    :param method: request method "GET", "POST", "PUT" etc
    :param headers: Dict of additional headers
    :param body: Request body for POST and PUT request
    :param connect_timeout:
    :param request_timeout:
    :param resolver:
    :param follow_redirects:
    :param max_redirects:
    :param validate_cert:
    :param allow_proxy:
    :param proxies:
    :param user:
    :param password:
    :param max_buffer_size:
    :param content_encoding:
    :param eof_mark: Do not consider connection reset as error if
        eof_mark received (string or list)
    :return: code, headers, body
    """

    def get_connect_options():
        opts = {}
        if use_tls and not proxy:
            ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
            if validate_cert:
                ctx.check_hostname = True
                ctx.verify_mode = ssl.CERT_REQUIRED
            else:
                ctx.check_hostname = False
                ctx.verify_mode = ssl.CERT_NONE
            opts["ssl"] = ctx
        return opts

    metrics["httpclient_requests", ("method", method.lower())] += 1
    #
    if eof_mark:
        eof_mark = smart_bytes(eof_mark)
    # Detect proxy when necessary
    u = urlparse(str(url))
    use_tls = u.scheme == "https"
    proto = "HTTPS" if use_tls else "HTTP"
    logger.debug("%s %s %s", proto, method, url)
    if ":" in u.netloc:
        host, port = u.netloc.rsplit(":", 1)
        port = int(port)
    else:
        host = u.netloc
        port = DEFAULT_PORTS.get(u.scheme)
        if not port:
            return ERR_TIMEOUT, {}, b"Cannot resolve port for scheme: %s" % smart_bytes(u.scheme)
    if is_ipv4(host):
        addr = host
    else:
        addr = await resolver(host)
    if not addr:
        return ERR_TIMEOUT, {}, b"Cannot resolve host: %s" % smart_bytes(host)
    # Detect proxy server
    if allow_proxy:
        proxy = (proxies or SYSTEM_PROXIES).get(u.scheme)
    else:
        proxy = None
    # Connect
    reader, writer = None, None
    if proxy:
        connect_address = proxy
    elif isinstance(addr, tuple):
        connect_address = addr
    else:
        connect_address = (addr, port)
    try:
        try:
            if proxy:
                logger.debug("Connecting to proxy %s:%s", connect_address[0], connect_address[1])
            reader, writer = await asyncio.wait_for(
                asyncio.open_connection(connect_address[0], connect_address[1], **get_connect_options()),
                connect_timeout,
            )
        except ConnectionRefusedError:
            metrics["httpclient_timeouts"] += 1
            return ERR_TIMEOUT, {}, b"Connection refused"
        except OSError as e:
            metrics["httpclient_timeouts"] += 1
            return ERR_TIMEOUT, {}, b"Connection error: %s" % smart_bytes(e)
        except asyncio.TimeoutError:
            metrics["httpclient_timeouts"] += 1
            return ERR_TIMEOUT, {}, b"Connection timed out"
        # Proxy CONNECT
        if proxy:
            logger.debug("Sending CONNECT %s:%s", addr, port)
            # Send CONNECT request
            req = b"CONNECT %s:%s HTTP/1.1\r\nUser-Agent: %s\r\n\r\n" % (
                smart_bytes(addr),
                smart_bytes(port),
                smart_bytes(DEFAULT_USER_AGENT),
            )
            writer.write(smart_bytes(req))
            try:
                await asyncio.wait_for(writer.drain(), request_timeout)
            except asyncio.TimeoutError:
                metrics["httpclient_proxy_timeouts"] += 1
                return ERR_TIMEOUT, {}, b"Timed out while sending request to proxy"
            # Wait for proxy response
            parser = HttpParser()
            while not parser.is_headers_complete():
                try:
                    data = await asyncio.wait_for(reader.read(max_buffer_size), request_timeout)
                except asyncio.TimeoutError:
                    metrics["httpclient_proxy_timeouts"] += 1
                    return ERR_TIMEOUT, {}, b"Timed out while sending request to proxy"
                received = len(data)
                parsed = parser.execute(data, received)
                if parsed != received:
                    return ERR_PARSE_ERROR, {}, b"Parse error"
            code = parser.get_status_code()
            logger.debug("Proxy response: %s", code)
            if not 200 <= code <= 299:
                return code, parser.get_headers(), b"Proxy error: %d" % code
        # Process request
        body = body or ""
        content_type = "application/binary"
        if not isinstance(body, (str, bytes)):
            body = smart_text(orjson.dumps(body))
            content_type = "text/json"
        body = smart_bytes(body)  # Here and below body is binary
        h = {
            "Host": str(u.netloc),
            "Connection": "close",
            "User-Agent": DEFAULT_USER_AGENT,
        }
        if body and content_encoding:
            if content_encoding == CE_DEFLATE:
                # Deflate compression
                h["Content-Encoding"] = CE_DEFLATE
                compress = zlib.compressobj(
                    zlib.Z_DEFAULT_COMPRESSION,
                    zlib.DEFLATED,
                    -zlib.MAX_WBITS,
                    zlib.DEF_MEM_LEVEL,
                    zlib.Z_DEFAULT_STRATEGY,
                )
                body = compress.compress(body) + compress.flush()
            elif content_encoding == CE_GZIP:
                # gzip compression
                h["Content-Encoding"] = CE_GZIP
                compress = zlib.compressobj(6, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0)
                crc = zlib.crc32(body, 0) & 0xFFFFFFFF
                body = b"\x1f\x8b\x08\x00%s\x02\xff%s%s%s%s" % (
                    to32u(int(time.time())),
                    compress.compress(body),
                    compress.flush(),
                    to32u(crc),
                    to32u(len(body)),
                )
        if method in REQUIRE_LENGTH_METHODS:
            h["Content-Length"] = str(len(body))
            h["Content-Type"] = content_type
        if user and password:
            # Include basic auth header
            uh = smart_text("%s:%s" % (user, password))
            h["Authorization"] = b"Basic %s" % codecs.encode(uh.encode("utf-8"), "base64").strip()
        if headers:
            h.update(headers)
        path = u.path
        if u.query:
            path += "?%s" % u.query
        req = b"%s %s HTTP/1.1\r\n%s\r\n\r\n%s" % (
            smart_bytes(method),
            smart_bytes(path),
            b"\r\n".join(b"%s: %s" % (smart_bytes(k), smart_bytes(h[k])) for k in h),
            body,
        )
        try:
            writer.write(req)
            await asyncio.wait_for(writer.drain(), request_timeout)
        except ConnectionResetError:
            metrics["httpclient_timeouts"] += 1
            return ERR_TIMEOUT, {}, b"Connection reset while sending request"
        except asyncio.TimeoutError:
            metrics["httpclient_timeouts"] += 1
            return ERR_TIMEOUT, {}, b"Timed out while sending request"
        parser = HttpParser()
        response_body: List[bytes] = []
        while not parser.is_message_complete():
            try:
                data = await asyncio.wait_for(reader.read(max_buffer_size), request_timeout)
                is_eof = not data
            except (asyncio.IncompleteReadError, ConnectionResetError):
                is_eof = True
            except asyncio.TimeoutError:
                metrics["httpclient_timeouts"] += 1
                return ERR_READ_TIMEOUT, {}, b"Request timed out"
            if is_eof:
                if eof_mark and response_body:
                    # Check if EOF mark is in received data
                    response_body = [b"".join(response_body)]
                    if isinstance(eof_mark, str):
                        if eof_mark in response_body[0]:
                            break
                    else:
                        found = False
                        for m in eof_mark:
                            if m in response_body[0]:
                                found = True
                                break
                        if found:
                            break
                metrics["httpclient_timeouts"] += 1
                return ERR_READ_TIMEOUT, {}, b"Connection reset"
            received = len(data)
            parsed = parser.execute(data, received)
            if parsed != received:
                return ERR_PARSE_ERROR, {}, b"Parse error"
            if parser.is_partial_body():
                response_body += [parser.recv_body()]
        code = parser.get_status_code()
        parsed_headers = parser.get_headers()
        logger.debug("HTTP Response %s", code)
        if 300 <= code <= 399 and follow_redirects:
            # Process redirects
            if max_redirects > 0:
                new_url = parsed_headers.get("Location")
                if not new_url:
                    return ERR_PARSE_ERROR, {}, b"No Location header"
                logger.debug("HTTP redirect %s %s", code, new_url)
                return await fetch(
                    new_url,
                    method="GET",
                    headers=headers,
                    connect_timeout=connect_timeout,
                    request_timeout=request_timeout,
                    resolver=resolver,
                    max_buffer_size=max_buffer_size,
                    follow_redirects=follow_redirects,
                    max_redirects=max_redirects - 1,
                    validate_cert=validate_cert,
                    allow_proxy=allow_proxy,
                    proxies=proxies,
                )
            else:
                return 404, {}, b"Redirect limit exceeded"
        # @todo: Process gzip and deflate Content-Encoding
        return code, parsed_headers, b"".join(response_body)
    finally:
        if writer:
            writer.close()
            try:
                await writer.wait_closed()
            except ConnectionResetError:
                pass
class TitleFetcher:
    status_code = 0
    followed_times = 0  # 301, 302
    finder = None
    addr = None
    stream = None
    max_follows = 10
    timeout = 15
    _finished = False
    _cookie = None
    _connected = False
    _redirected_stream = None
    _content_finders = (TitleFinder, PNGFinder, JPEGFinder, GIFFinder)
    _url_finders = ()

    def __init__(
        self, url, callback,
        timeout=None, max_follows=None, io_loop=None,
        content_finders=None, url_finders=None, referrer=None,
        run_at_init=True,
    ):
        '''
        url: the (full) url to fetch
        callback: called with title or MediaType or an instance of SingletonFactory
        timeout: total time including redirection before giving up
        max_follows: max redirections

        may raise: <UnicodeError: label empty or too long> in host preparation
        '''
        self._callback = callback
        self.referrer = referrer
        if max_follows is not None:
            self.max_follows = max_follows
        if timeout is not None:
            self.timeout = timeout
        if hasattr(tornado.ioloop, 'current'):
            default_io_loop = tornado.ioloop.IOLoop.current
        else:
            default_io_loop = tornado.ioloop.IOLoop.instance
        self.io_loop = io_loop or default_io_loop()
        if content_finders is not None:
            self._content_finders = content_finders
        if url_finders is not None:
            self._url_finders = url_finders
        self.origurl = url
        self.url_visited = []
        if run_at_init:
            self.run()

    def run(self):
        if self.url_visited:
            raise Exception("can't run again")
        else:
            self.start_time = self.io_loop.time()
            self._timeout = self.io_loop.add_timeout(
                self.timeout + self.start_time,
                self.on_timeout,
            )
            try:
                self.new_url(self.origurl)
            except:
                self.io_loop.remove_timeout(self._timeout)
                raise

    def on_timeout(self):
        logger.debug('%s: request timed out', self.origurl)
        self.run_callback(Timeout)

    def parse_url(self, url):
        '''parse `url`, set self.host and return address and stream class'''
        self.url = u = urlsplit(url)
        self.host = u.netloc
        if u.scheme == 'http':
            addr = u.hostname, u.port or 80
            stream = tornado.iostream.IOStream
        elif u.scheme == 'https':
            addr = u.hostname, u.port or 443
            stream = tornado.iostream.SSLIOStream
        else:
            raise ValueError('bad url: %r' % url)
        return addr, stream

    def new_connection(self, addr, StreamClass):
        '''set self.addr, self.stream and connect to host'''
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.addr = addr
        self.stream = StreamClass(s)
        logger.debug('%s: connecting to %s...', self.origurl, addr)
        self.stream.set_close_callback(self.before_connected)
        self.stream.connect(addr, self.send_request)

    def new_url(self, url):
        self.url_visited.append(url)
        self.fullurl = url

        for finder in self._url_finders:
            f = finder.match_url(url, self)
            if f:
                self.finder = f
                f()
                return

        addr, StreamClass = self.parse_url(url)
        if addr != self.addr:
            if self.stream:
                self.stream.close()
            self.new_connection(addr, StreamClass)
        else:
            logger.debug('%s: try to reuse existing connection to %s',
                         self.origurl, self.addr)
            try:
                self.send_request(nocallback=True)
            except tornado.iostream.StreamClosedError:
                logger.debug(
                    '%s: server at %s doesn\'t like keep-alive, will reconnect.',
                    self.origurl, self.addr)
                # The close callback should have already run
                self.stream.close()
                self.new_connection(addr, StreamClass)

    def run_callback(self, arg):
        self.io_loop.remove_timeout(self._timeout)
        self._finished = True
        if self.stream:
            self.stream.close()
        self._callback(arg, self)

    def send_request(self, nocallback=False):
        self._connected = True
        req = [
            'GET %s HTTP/1.1',
            'Host: %s',
            # t.co will return 200 and use js/meta to redirect using the following :-(
            # 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0',
            'User-Agent: %s' % UserAgent,
            'Accept: text/html,application/xhtml+xml;q=0.9,*/*;q=0.7',
            'Accept-Language: zh-cn,zh;q=0.7,en;q=0.3',
            'Accept-Charset: utf-8,gb18030;q=0.7,*;q=0.7',
            'Accept-Encoding: gzip, deflate',
            'Connection: keep-alive',
        ]
        if self.referrer is not None:
            req.append('Referer: ' + self.referrer.replace('%', '%%'))
        path = self.url.path or '/'
        if self.url.query:
            path += '?' + self.url.query
        req = '\r\n'.join(req) % (
            path, self._prepare_host(self.host),
        )
        if self._cookie:
            req += '\r\n' + self._cookie
        req += '\r\n\r\n'
        self.stream.write(req.encode())
        self.headers_done = False
        self.parser = HttpParser(decompress=True)
        if not nocallback:
            self.stream.read_until_close(
                # self.addr and self.stream may have been changed when close callback is run
                partial(self.on_data, close=True, addr=self.addr, stream=self.stream),
                streaming_callback=self.on_data,
            )

    def _prepare_host(self, host):
        host = encodings.idna.nameprep(host)
        return b'.'.join(
            encodings.idna.ToASCII(x) if x else b''
            for x in host.split('.')).decode('ascii')

    def on_data(self, data, close=False, addr=None, stream=None):
        if close:
            logger.debug('%s: connection to %s closed.', self.origurl, addr)
            if self.stream.error:
                self.run_callback(self.stream.error)
                return

        if (close and stream and self._redirected_stream is stream) or self._finished:
            # The connection is closing, and we are being redirected or we're done.
            self._redirected_stream = None
            return

        recved = len(data)
        logger.debug('%s: received data: %d bytes', self.origurl, recved)

        p = self.parser
        nparsed = p.execute(data, recved)
        if close:
            # feed EOF
            p.execute(b'', 0)

        if not self.headers_done and p.is_headers_complete():
            if not self.on_headers_done():
                return

        if p.is_partial_body():
            chunk = p.recv_body()
            if self.finder is None:
                # redirected but has body received
                return
            t = self.feed_finder(chunk)
            if t is not None:
                self.run_callback(t)
                return

        if p.is_message_complete():
            if self.finder is None:
                # redirected but has body received
                return
            t = self.feed_finder(None)
            # if title not found, t is None
            self.run_callback(t)
        elif close:
            self.run_callback(self.stream.error or ConnectionClosed)

    def before_connected(self):
        '''check if something wrong before connected'''
        if not self._connected and not self._finished:
            self.run_callback(self.stream.error)

    def process_cookie(self):
        setcookie = self.headers.get('Set-Cookie', None)
        if not setcookie:
            return

        cookies = [
            c.rsplit(None, 1)[-1]
            for c in setcookie.split('; expires')[:-1]
        ]
        self._cookie = 'Cookie: ' + '; '.join(cookies)

    def on_headers_done(self):
        '''returns True if should proceed, None if should stop for current chunk'''
        self.headers_done = True
        self.headers = self.parser.get_headers()

        self.status_code = self.parser.get_status_code()
        if self.status_code in (301, 302):
            self.process_cookie()  # or we may be redirecting to a loop
            logger.debug('%s: redirect to %s',
                         self.origurl, self.headers['Location'])
            self.followed_times += 1
            if self.followed_times > self.max_follows:
                self.run_callback(TooManyRedirection)
            else:
                newurl = urljoin(self.fullurl, self.headers['Location'])
                self._redirected_stream = self.stream
                self.new_url(newurl)
            return

        try:
            l = int(self.headers.get('Content-Length', None))
        except (ValueError, TypeError):
            l = None

        ctype = self.headers.get('Content-Type', 'text/html')
        mt = defaultMediaType._replace(type=ctype, size=l)
        for finder in self._content_finders:
            f = finder.match_type(mt)
            if f:
                self.finder = f
                break
        else:
            self.run_callback(mt)
            return

        return True

    def feed_finder(self, chunk):
        '''feed data to finder, return the title if found'''
        t = self.finder(chunk)
        if t is not None:
            return t
class TitleFetcher:
    default_charset = 'UTF-8'
    meta_charset = re.compile(br'<meta\s+http-equiv="?content-type"?\s+content="?[^;]+;\s*charset=([^">]+)"?\s*/?>|<meta\s+charset="?([^">/"]+)"?\s*/?>', re.IGNORECASE)
    charset = None
    status_code = 0
    followed_times = 0  # 301, 302
    addr = None
    stream = None
    max_follows = 10
    timeout = 15
    _finished = False
    _cookie = None
    _connected = False
    _redirecting = False

    def __init__(self, url, callback, timeout=None, max_follows=None, io_loop=None):
        '''
        url: the (full) url to fetch
        callback: called with title or MediaType or an instance of SingletonFactory
        timeout: total time including redirection before giving up
        max_follows: max redirections
        '''
        self._callback = callback
        if max_follows is not None:
            self.max_follows = max_follows
        if timeout is not None:
            self.timeout = timeout
        self.io_loop = io_loop or tornado.ioloop.IOLoop.instance()
        self.start_time = self.io_loop.time()
        self._timeout = self.io_loop.add_timeout(
            self.timeout + self.start_time,
            self.on_timeout,
        )
        self.origurl = url
        self.new_url(url)

    def on_timeout(self):
        self.run_callback(Timeout)

    def parse_url(self, url):
        '''parse `url`, set self.host and return address and stream class'''
        self.url = u = urlsplit(url)
        self.host = u.netloc
        if u.scheme == 'http':
            addr = u.hostname, u.port or 80
            stream = tornado.iostream.IOStream
        elif u.scheme == 'https':
            addr = u.hostname, u.port or 443
            stream = tornado.iostream.SSLIOStream
        else:
            raise ValueError('bad url: %r' % url)
        return addr, stream

    def new_connection(self, addr, StreamClass):
        '''set self.addr, self.stream and connect to host'''
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.addr = addr
        self.stream = StreamClass(s)
        logger.debug('%s: connecting to %s...', self.origurl, addr)
        self.stream.set_close_callback(self.before_connected)
        self.stream.connect(addr, self.send_request)

    def new_url(self, url):
        self.fullurl = url
        addr, StreamClass = self.parse_url(url)
        if addr != self.addr:
            if self.stream:
                self.stream.close()
            self.new_connection(addr, StreamClass)
        else:
            logger.debug('%s: try to reuse existing connection to %s',
                         self.origurl, self.addr)
            try:
                self.send_request(nocallback=True)
            except tornado.iostream.StreamClosedError:
                logger.debug('%s: server at %s doesn\'t like keep-alive, will reconnect.',
                             self.origurl, self.addr)
                # The close callback should have already run
                self.stream.close()
                self.new_connection(addr, StreamClass)

    def run_callback(self, arg):
        self.io_loop.remove_timeout(self._timeout)
        self._finished = True
        self.stream.close()
        self._callback(arg, self)

    def send_request(self, nocallback=False):
        self._connected = True
        req = ('GET %s HTTP/1.1',
               'Host: %s',
               # t.co will return 200 and use js/meta to redirect using the following :-(
               # 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0',
               'User-Agent: FetchTitle/1.0',
               'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.7',
               'Accept-Language: zh-cn,zh;q=0.7,en;q=0.3',
               'Accept-Charset: utf-8,gb18030;q=0.7,*;q=0.7',
               'Accept-Encoding: gzip, deflate',
               'Connection: keep-alive',
               )
        path = self.url.path or '/'
        if self.url.query:
            path += '?' + self.url.query
        req = '\r\n'.join(req) % (
            path, self.host,
        )
        if self._cookie:
            req += '\r\n' + self._cookie
        req += '\r\n\r\n'
        self.stream.write(req.encode())
        self.headers_done = False
        self.parser = HttpParser(decompress=True)
        if not nocallback:
            self.stream.read_until_close(
                # self.addr will have been changed when close callback is run
                partial(self.on_data, close=True, addr=self.addr),
                streaming_callback=self.on_data,
            )

    def on_data(self, data, close=False, addr=None):
        if close:
            logger.debug('%s: connection to %s closed.', self.origurl, addr)

        if (close and self._redirecting) or self._finished:
            # The connection is closing, and we are being redirected or we're done.
            self._redirecting = False
            return

        recved = len(data)
        logger.debug('%s: received data: %d bytes', self.origurl, recved)

        p = self.parser
        nparsed = p.execute(data, recved)
        if close:
            # feed EOF
            p.execute(b'', 0)

        if not self.headers_done and p.is_headers_complete():
            if not self.on_headers_done():
                return

        if p.is_partial_body():
            chunk = p.recv_body()
            if not self.charset:
                m = self.meta_charset.search(chunk)
                if m:
                    self.charset = (m.group(1) or m.group(2)).decode('latin1')
            t = self.feed_finder(chunk)
            if t:
                self.run_callback(t)
                return

        if p.is_message_complete():
            t = self.feed_finder(None)
            # if title not found, t is None
            self.run_callback(t)
        elif close:
            self.run_callback(self.stream.error or ConnectionClosed)

    def before_connected(self):
        '''check if something wrong before connected'''
        if not self._connected and not self._finished:
            self.run_callback(self.stream.error)

    def process_cookie(self):
        setcookie = self.headers.get('Set-Cookie', None)
        if not setcookie:
            return

        cookies = [c.rsplit(None, 1)[-1]
                   for c in setcookie.split('; expires')[:-1]]
        self._cookie = 'Cookie: ' + '; '.join(cookies)

    def on_headers_done(self):
        '''returns True if should proceed, None if should stop for current chunk'''
        self.headers_done = True
        self.headers = self.parser.get_headers()

        self.status_code = self.parser.get_status_code()
        if self.status_code in (301, 302):
            self.process_cookie()  # or we may be redirecting to a loop
            logger.debug('%s: redirect to %s',
                         self.origurl, self.headers['Location'])
            self.followed_times += 1
            if self.followed_times > self.max_follows:
                self.run_callback(TooManyRedirection)
            else:
                newurl = urljoin(self.fullurl, self.headers['Location'])
                self._redirecting = True
                self.new_url(newurl)
            return

        ctype = self.headers.get('Content-Type', 'text/html')
        if ctype.find('html') == -1:
            try:
                l = int(self.headers.get('Content-Length', None))
            except (ValueError, TypeError):
                l = None
            mt = defaultMediaType._replace(type=ctype, size=l)
            ctype = ctype.split(';', 1)[0]
            if ctype == 'image/png':
                self.finder = PNGFinder(mt)
            elif ctype == 'image/jpeg':
                self.finder = JPEGFinder(mt)
            elif ctype == 'image/gif':
                self.finder = GIFFinder(mt)
            else:
                self.run_callback(mt)
                return
        else:
            self.finder = TitleFinder()
            pos = ctype.find('charset=')
            if pos > 0:
                self.charset = ctype[pos + 8:]
        return True

    def feed_finder(self, chunk):
        '''feed data to TitleFinder, return the title if found'''
        t = self.finder(chunk)
        if t:
            if self.charset is None:
                self.charset = self.default_charset
            if isinstance(t, bytes):
                try:
                    title = replaceEntities(t.decode(self.charset))
                    return title
                except (UnicodeDecodeError, LookupError):
                    return t
            else:
                return t
class HttpProxyProtocol(asyncio.Protocol):
    ''' Implement HTTP(S) proxy behavior. '''

    def __init__(self, loop, config, token_store):
        ''' Constructor. '''
        self._parser = HttpParser()
        self._body = b''
        self._config = config
        self._loop = loop
        self._mitm = None
        self._mitm_host = None
        self._token_store = token_store

        self._instagram = InstagramApi(
            client_id=config['Instagram']['ClientID'],
            client_secret=config['Instagram']['ClientSecret'],
        )

        self._twitter = TwitterApi(
            consumer_key=config['Twitter']['ConsumerKey'],
            consumer_secret=config['Twitter']['ConsumerSecret'],
            app_token=config['Twitter']['AppToken'],
            app_token_secret=config['Twitter']['AppTokenSecret'],
        )

    def connection_made(self, transport):
        ''' Save a reference to the transport so that we can send a reply. '''
        log.debug('Connection opened.')
        self._transport = transport

    def connection_lost(self, exc):
        log.debug('Connection closed.')

    def data_received(self, data):
        ''' Parse incoming HTTP request. '''
        log.debug('Data received: {}'.format(data))
        self._parser.execute(data, len(data))

        if self._parser.is_partial_body():
            self._body += self._parser.recv_body()

        if self._parser.is_message_complete():
            method = self._parser.get_method()
            uri = self._parser.get_url()
            version = self._parser.get_version()
            headers = self._parser.get_headers()

            content_type = headers.get('Content-type', '')
            charset = _get_charset(content_type)
            body = self._body.decode(charset)

            log.debug('Client charset: {}'.format(charset))
            log.debug('Client status: method={} uri={} version={}'
                      .format(method, uri, version))
            log.debug('Client headers: {}'.format(headers))
            log.debug('Client body: {}...'.format(body[:1000]))

            if method == 'CONNECT':
                asyncio.async(self._start_mitm(uri, version))
                self._parser = HttpParser()
            else:
                asyncio.async(
                    self._request_upstream(
                        method, uri, version, headers, body
                    )
                )

    def start_tls(self, version):
        '''
        Initiate TLS session with the client.

        This part is completely hacky! We mess around with the transport's
        internals in order to wrap the current transport in TLS. Python
        doesn't have an official way to do this, although it *might* get
        fixed in 3.6: http://bugs.python.org/issue23749
        '''
        log.debug('The proxy is starting TLS with its client.')

        status_line = 'HTTP/{}.{} {} {}\r\n\r\n' \
            .format(version[0], version[1], 200, 'OK')
        self._transport.write(status_line.encode('ascii'))

        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        ssl_context.set_ciphers('HIGH:!aNull:!eNull')
        ssl_context.load_cert_chain('ssl/server.crt', 'ssl/server.key')

        original_socket = self._transport._sock
        self._transport = self._loop._make_ssl_transport(
            original_socket,
            self,
            ssl_context,
            server_side=True
        )

    @asyncio.coroutine
    def _request_upstream(self, method, uri, version, headers, body):
        ''' Forward the request to the upstream server. '''
        try:
            yield from self._request_upstream_helper(
                method, uri, version, headers, body
            )
        except Exception:
            charset = _get_charset(headers.get('Content-type', ''))
            response = render_http_response(
                version,
                500,
                'PROXY ERROR',
                {'Content-type': 'text/plain; charset={}'.format(charset)},
                traceback.format_exc().encode(charset)
            )
            self._transport.write(response)
            self._transport.close()
            raise

    @asyncio.coroutine
    def _request_upstream_helper(self, method, uri, version, headers, body):
        ''' Forward the request to the upstream server. '''
        log.debug('_request_upstream(): method={}, uri={}'
                  .format(method, uri))

        parsed = urlparse(uri)
        if self._mitm_host:
            url = 'https://{}{}'.format(self._mitm_host, parsed.path)
        else:
            url = uri

        token, remaining = self._token_store.dispense(url)
        log.debug('Signing request with {} token: {}.'
                  .format(token.site, token.public))

        if 'instagram' in url:
            qp = parse_qs(parsed.query)
            qp['access_token'] = token.public
            qp['sig'] = self._instagram.oauth_sign(
                method=method,
                url=url,
                token=token,
                query_params=qp,
                body_params=parse_qs(body)
            )

            params = ['{}={}'.format(quote(k.encode('utf8')),
                                     quote(v.encode('utf8')))
                      for k, v in qp.items()]
            uri = '{}?{}'.format(parsed.path, '&'.join(params))
            log.debug('Signed instagram URL: {}'.format(uri))
        elif 'twitter' in url:
            headers['Authorization'] = self._twitter.oauth_sign(
                method=method,
                url=url,
                token=token.public,
                token_secret=token.secret,
                query_params=parse_qs(parsed.query),
                body_params=parse_qs(body)
            )
        else:
            raise ValueError('No signing algorithm known for URL: {}'
                             .format(url))

        if self._mitm is None:
            url = urlparse(uri)
            host = url.hostname
            port = url.port

            if port is None:
                port = 80 if url.scheme == 'http' else 443

            log.debug('Connecting to upstream (plaintext).')
            upstream = yield from asyncio.open_connection(host, port)
            upstream_reader, upstream_writer = upstream
            request = render_http_request(method, uri, version, headers, body)
            upstream_writer.write(request)

            response = b''
            # Accumulate the upstream body separately from the request body.
            body = b''
            parser = HttpParser()

            while True:
                if not parser.is_headers_complete():
                    data = yield from upstream_reader.readline()
                else:
                    data = yield from upstream_reader.read(
                        int(parser.get_headers()['Content-Length'])
                    )

                log.debug('Received plaintext from upstream: {}'.format(data))
                parser.execute(data, len(data))

                if parser.is_partial_body():
                    body += parser.recv_body()

                if parser.is_message_complete():
                    version = parser.get_version()
                    status = parser.get_status_code()
                    reason = None  # For some reason, the parser doesn't expose this :(
                    headers = parser.get_headers()

                    if status == 200:
                        self._token_store.update_rate_limit(url, headers)

                    log.debug('Plaintext upstream status: {}'.format(status))
                    log.debug('Plaintext upstream headers: {}'.format(headers))
                    log.debug('Plaintext upstream body: {}...'.format(body[:1000]))

                    response = render_http_response(
                        version, status, reason, headers, body
                    )
                    break

            upstream_writer.close()
        else:
            upstream_write = self._mitm.forward
            request = render_http_request(method, uri, version, headers, body)
            upstream_write(request)
            response = yield from self._mitm.receive()
            version, status, reason, headers, body = response

            if status == 200:
                self._token_store.update_rate_limit(token, url, headers)

            response = render_http_response(
                version, status, reason, headers, body
            )

        # Forward the upstream response to the client.
        self._transport.write(response)
        self._transport.close()

    def _set_header(self, key, value):
        ''' Set a header value. '''
        key = key.strip().upper()
        value = value.strip()
        self._headers[key] = value

    @asyncio.coroutine
    def _start_mitm(self, uri, version):
        ''' MITM a connection to the upstream server. '''
        log.debug('The proxy is starting an MITM connection.')
        host, port = uri.split(':')
        port = int(port)
        self._mitm_host = host
        _, self._mitm = yield from self._loop.create_connection(
            lambda: MitmProtocol(self._loop, version, self),
            host,
            port,
            ssl=ssl.create_default_context()
        )
class TitleFetcher:
    status_code = 0
    followed_times = 0  # 301, 302
    finder = None
    addr = None
    stream = None
    max_follows = 10
    timeout = 15
    _finished = False
    _cookie = None
    _connected = False
    _redirected_stream = None
    _content_finders = (TitleFinder, PNGFinder, JPEGFinder, GIFFinder)
    _url_finders = ()

    def __init__(self, url, callback, timeout=None, max_follows=None,
                 io_loop=None, content_finders=None, url_finders=None):
        '''
        url: the (full) url to fetch
        callback: called with title or MediaType or an instance of SingletonFactory
        timeout: total time including redirection before giving up
        max_follows: max redirections
        '''
        self._callback = callback
        if max_follows is not None:
            self.max_follows = max_follows
        if timeout is not None:
            self.timeout = timeout
        if hasattr(tornado.ioloop, 'current'):
            default_io_loop = tornado.ioloop.IOLoop.current
        else:
            default_io_loop = tornado.ioloop.IOLoop.instance
        self.io_loop = io_loop or default_io_loop()
        if content_finders is not None:
            self._content_finders = content_finders
        if url_finders is not None:
            self._url_finders = url_finders
        self.start_time = self.io_loop.time()
        self._timeout = self.io_loop.add_timeout(
            self.timeout + self.start_time,
            self.on_timeout,
        )
        self.origurl = url
        self.url_visited = []
        self.new_url(url)

    def on_timeout(self):
        self.run_callback(Timeout)

    def parse_url(self, url):
        '''parse `url`, set self.host and return address and stream class'''
        self.url = u = urlsplit(url)
        self.host = u.netloc
        if u.scheme == 'http':
            addr = u.hostname, u.port or 80
            stream = tornado.iostream.IOStream
        elif u.scheme == 'https':
            addr = u.hostname, u.port or 443
            stream = tornado.iostream.SSLIOStream
        else:
            raise ValueError('bad url: %r' % url)
        return addr, stream

    def new_connection(self, addr, StreamClass):
        '''set self.addr, self.stream and connect to host'''
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.addr = addr
        self.stream = StreamClass(s)
        logger.debug('%s: connecting to %s...', self.origurl, addr)
        self.stream.set_close_callback(self.before_connected)
        self.stream.connect(addr, self.send_request)

    def new_url(self, url):
        self.url_visited.append(url)
        self.fullurl = url

        for finder in self._url_finders:
            f = finder.match_url(url, self)
            if f:
                self.finder = f
                f()
                return

        addr, StreamClass = self.parse_url(url)
        if addr != self.addr:
            if self.stream:
                self.stream.close()
            self.new_connection(addr, StreamClass)
        else:
            logger.debug('%s: try to reuse existing connection to %s',
                         self.origurl, self.addr)
            try:
                self.send_request(nocallback=True)
            except tornado.iostream.StreamClosedError:
                logger.debug('%s: server at %s doesn\'t like keep-alive, will reconnect.',
                             self.origurl, self.addr)
                # The close callback should have already run
                self.stream.close()
                self.new_connection(addr, StreamClass)

    def run_callback(self, arg):
        self.io_loop.remove_timeout(self._timeout)
        self._finished = True
        if self.stream:
            self.stream.close()
        self._callback(arg, self)

    def send_request(self, nocallback=False):
        self._connected = True
        req = ('GET %s HTTP/1.1',
               'Host: %s',
               # t.co will return 200 and use js/meta to redirect using the following :-(
               # 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0',
               'User-Agent: %s' % UserAgent,
               'Accept: text/html,application/xhtml+xml;q=0.9,*/*;q=0.7',
               'Accept-Language: zh-cn,zh;q=0.7,en;q=0.3',
               'Accept-Charset: utf-8,gb18030;q=0.7,*;q=0.7',
               'Accept-Encoding: gzip, deflate',
               'Connection: keep-alive',
               )
        path = self.url.path or '/'
        if self.url.query:
            path += '?' + self.url.query
        req = '\r\n'.join(req) % (
            path, self._prepare_host(self.host),
        )
        if self._cookie:
            req += '\r\n' + self._cookie
        req += '\r\n\r\n'
        self.stream.write(req.encode())
        self.headers_done = False
        self.parser = HttpParser(decompress=True)
        if not nocallback:
            self.stream.read_until_close(
                # self.addr will have been changed when close callback is run
                partial(self.on_data, close=True, addr=self.addr),
                streaming_callback=self.on_data,
            )

    def _prepare_host(self, host):
        host = encodings.idna.nameprep(host)
        return b'.'.join(encodings.idna.ToASCII(x)
                         for x in host.split('.')).decode('ascii')

    def on_data(self, data, close=False, addr=None):
        if close:
            logger.debug('%s: connection to %s closed.', self.origurl, addr)

        if (close and self._redirected_stream is self.stream) or self._finished:
            # The connection is closing, and we are being redirected or we're done.
            self._redirected_stream = None
            return

        recved = len(data)
        logger.debug('%s: received data: %d bytes', self.origurl, recved)

        p = self.parser
        nparsed = p.execute(data, recved)
        if close:
            # feed EOF
            p.execute(b'', 0)

        if not self.headers_done and p.is_headers_complete():
            if not self.on_headers_done():
                return

        if p.is_partial_body():
            chunk = p.recv_body()
            if self.finder is None:
                # redirected but has body received
                return
            t = self.feed_finder(chunk)
            if t is not None:
                self.run_callback(t)
                return

        if p.is_message_complete():
            if self.finder is None:
                # redirected but has body received
                return
            t = self.feed_finder(None)
            # if title not found, t is None
            self.run_callback(t)
        elif close:
            self.run_callback(self.stream.error or ConnectionClosed)

    def before_connected(self):
        '''check if something wrong before connected'''
        if not self._connected and not self._finished:
            self.run_callback(self.stream.error)

    def process_cookie(self):
        setcookie = self.headers.get('Set-Cookie', None)
        if not setcookie:
            return

        cookies = [c.rsplit(None, 1)[-1]
                   for c in setcookie.split('; expires')[:-1]]
        self._cookie = 'Cookie: ' + '; '.join(cookies)

    def on_headers_done(self):
        '''returns True if should proceed, None if should stop for current chunk'''
        self.headers_done = True
        self.headers = self.parser.get_headers()

        self.status_code = self.parser.get_status_code()
        if self.status_code in (301, 302):
            self.process_cookie()  # or we may be redirecting to a loop
            logger.debug('%s: redirect to %s',
                         self.origurl, self.headers['Location'])
            self.followed_times += 1
            if self.followed_times > self.max_follows:
                self.run_callback(TooManyRedirection)
            else:
                newurl = urljoin(self.fullurl, self.headers['Location'])
                self._redirected_stream = self.stream
                self.new_url(newurl)
            return

        try:
            l = int(self.headers.get('Content-Length', None))
        except (ValueError, TypeError):
            l = None

        ctype = self.headers.get('Content-Type', 'text/html')
        mt = defaultMediaType._replace(type=ctype, size=l)
        for finder in self._content_finders:
            f = finder.match_type(mt)
            if f:
                self.finder = f
                break
        else:
            self.run_callback(mt)
            return

        return True

    def feed_finder(self, chunk):
        '''feed data to TitleFinder, return the title if found'''
        t = self.finder(chunk)
        if t is not None:
            return t
def parse(self):
    data = [{
        'label': '以太网帧头部 / Ethernet Headers',
        'value': '',
        'bold': True,
        'children': [{
            'label': '目的端 MAC 地址',
            'value': self.ethHeader.destMac
        }, {
            'label': '发送端 MAC 地址',
            'value': self.ethHeader.sourceMac
        }, {
            'label': '帧类型',
            'value': '%s (0x%s)' % (self.ethHeader.type, self.ethHeader.type_code)
        }]
    }]
    if self.protocol == 'ARP':
        data.append({
            'label': 'ARP 消息 / Address Resolution Protocol',
            'value': '',
            'bold': True,
            'children': [{
                'label': '硬件类型',
                'value': '%s (%s)' % (self.arpBody.hardware_type,
                                      self.arpBody.hardware_type_code)
            }, {
                'label': '协议类型',
                'value': '%s (0x%s)' % (self.arpBody.protocol_type,
                                        self.arpBody.protocol_type_code)
            }, {
                'label': '硬件地址长度',
                'value': str(self.arpBody.hardware_size)
            }, {
                'label': '协议地址长度',
                'value': str(self.arpBody.protocol_size)
            }, {
                'label': '操作码',
                'value': '%s (%s)' % (self.arpBody.operation,
                                      self.arpBody.operation_code)
            }, {
                'label': '发送端 MAC 地址',
                'value': self.arpBody.sender_mac_address
            }, {
                'label': '发送端 IP 地址',
                'value': self.arpBody.sender_ip_address
            }, {
                'label': '目的端 MAC 地址',
                'value': self.arpBody.target_mac_address
            }, {
                'label': '目的端 IP 地址',
                'value': self.arpBody.target_ip_address
            }]
        })
    else:
        if self.ipHeader.version == 4:
            self.ipHeader.verifyChecksum = verifyChecksum(
                self.ipHeader.header_raw, [], '').verifyChecksum
            data.append({
                'label': 'IPv4 头部 / IPv4 Header',
                'value': '',
                'bold': True,
                'children': [{
                    'label': '协议版本',
                    'value': self.ipHeader.version
                }, {
                    'label': '头部长度',
                    'value': str(self.ipHeader.header_length) + ' Bytes'
                }, {
                    'label': '服务类型',
                    'value': '0x%s' % self.ipHeader.differentiated_services
                }, {
                    'label': '来源 IP',
                    'value': self.ipHeader.source_ip
                }, {
                    'label': '目标 IP',
                    'value': self.ipHeader.dest_ip
                }, {
                    'label': '总长度',
                    'value': self.ipHeader.total_length
                }, {
                    'label': '标识',
                    'value': '0x%s (%s)' % (self.ipHeader.identification,
                                            self.ipHeader.identification_int)
                }, {
                    'label': '标志',
                    'value': '%s' % self.ipHeader.flags.raw,
                    'children': [{
                        'label': '保留位',
                        'value': '%s | %s... .... .... ....'
                                 % (self.ipHeader.flags.reserved,
                                    int(self.ipHeader.flags.reserved))
                    }, {
                        'label': "Don't fragment",
                        'value': '%s | .%s.. .... .... ....'
                                 % (self.ipHeader.flags.fragment,
                                    int(self.ipHeader.flags.fragment))
                    }, {
                        'label': 'More fragments',
                        'value': '%s | ..%s. .... .... ....'
                                 % (self.ipHeader.flags.more_fragment,
                                    int(self.ipHeader.flags.more_fragment))
                    }, {
                        'label': '分段偏移',
                        'value': '%s | ...%s' % (self.ipHeader.flags.fragment_offset,
                                                 self.ipHeader.flags.fragment_offset_bin)
                    }]
                }, {
                    'label': '生存期',
                    'value': self.ipHeader.time_to_live
                }, {
                    'label': '协议',
                    'value': '%s (%s)' % (self.ipHeader.protocol,
                                          self.ipHeader.protocol_code)
                }, {
                    'label': '校验和',
                    'value': '0x%s (%s)' % (self.ipHeader.origin_checksum,
                                            '校验' + {True: '通过', False: '失败'}[self.ipHeader.verifyChecksum])
                }]
            })
        else:
            ipv6_header = {
                'label': 'IPv6 头部 / IPv6 Header',
                'value': '',
                'bold': True,
                'children': [{
                    'label': '协议版本',
                    'value': self.ipHeader.version
                }, {
                    'label': '通信分类',
                    'value': '0x%s' % self.ipHeader._class
                }, {
                    'label': '流标签',
                    'value': '0x%s' % self.ipHeader.float_label
                }, {
                    'label': '有效载荷长度',
                    'value': self.ipHeader.payload_length
                }, {
                    'label': '下一头部类型',
                    'value': '%s (%s)' % (self.ipHeader.next_header,
                                          self.ipHeader.next_header_code)
                }, {
                    'label': '跳数限制',
                    'value': self.ipHeader.hop_limit
                }, {
                    'label': '源 IP',
                    'value': self.ipHeader.source_ip
                }, {
                    'label': '目的 IP',
                    'value': self.ipHeader.dest_ip
                }]
            }
            for option in self.ipHeader.options:
                ipv6_header['children'].append({
                    'label': consts.protocol_types[str(option['code'])],
                    'value': '0x' + option['value'],
                    'children': [{
                        'label': '下一头部类型',
                        'value': '%s (%s)' % (consts.protocol_types[str(option['next_header'])],
                                              option['next_header'])
                    }]
                })
            data.append(ipv6_header)

        if self.ipHeader.version == 4 and self.ipHeader.flags.more_fragment:
            # more fragments of this datagram are still expected
            ids = self.ip_ids[self.ipHeader.identification_int]
            slicing = {
                'label': 'IP 分片',
                'value': '共 %s 个数据包' % len(ids),
                'bold': True,
                'children': []
            }
            for packet_id in ids:
                slicing['children'].append({
                    'label': '#%s' % packet_id,
                    'value': '%s Bytes' % (self.ip_packets[packet_id].length / 8)
                })
            data.append(slicing)
        else:
            if self.ipHeader.protocol == 'TCP':
                self.ipBody.tcpHeader.verifyChecksum = verifyChecksum(
                    self.ipBody.parameters[0], self.ipBody.parameters[1],
                    self.ipHeader.protocol).verifyChecksum
                # TCP options start at bit 160 (after the fixed 20-byte header)
                self.ipBody.tcpHeader.options = tcpOptions(
                    BitArray(self.ipBodyRaw)[160:self.ipBody.tcpHeader.header_length * 8]).options
                tcp_header = {
                    'label': 'TCP 头部 / Transmission Control Protocol Header',
                    'value': '',
                    'bold': True,
                    'children': [{
                        'label': '源端口',
                        'value': self.ipBody.tcpHeader.source_port
                    }, {
                        'label': '目的端口',
                        'value': self.ipBody.tcpHeader.destination_port
                    }, {
                        'label': '数据序号 (seq)',
                        'value': self.ipBody.tcpHeader.sequence_number
                    }, {
                        'label': '确认序号 (ack)',
                        'value': self.ipBody.tcpHeader.acknowledge_number
                    }, {
                        'label': '首部长度',
                        'value': self.ipBody.tcpHeader.header_length
                    }, {
                        'label': '标志位',
                        'value': '0x' + self.ipBody.tcpHeader.flags_raw,
                        'children': [{
                            'label': 'Reserved',
                            'value': '%s | %s. .... ....'
                                     % (self.ipBody.tcpHeader.flags.reserved.uint,
                                        self.ipBody.tcpHeader.flags.reserved.bin)
                        }, {
                            'label': 'Nonce',
                            'value': '%s | ...%d .... ....'
                                     % (self.ipBody.tcpHeader.flags.nonce,
                                        self.ipBody.tcpHeader.flags.nonce)
                        }, {
                            'label': 'Congestion Window Reduced',
                            'value': '%s | .... %d... ....'
                                     % (self.ipBody.tcpHeader.flags.cwr,
                                        self.ipBody.tcpHeader.flags.cwr)
                        }, {
                            'label': 'ECN-Echo',
                            'value': '%s | .... .%d.. ....'
                                     % (self.ipBody.tcpHeader.flags.ecn_echo,
                                        self.ipBody.tcpHeader.flags.ecn_echo)
                        }, {
                            'label': 'Urgent',
                            'value': '%s | .... ..%d. ....'
                                     % (self.ipBody.tcpHeader.flags.urgent,
                                        self.ipBody.tcpHeader.flags.urgent)
                        }, {
                            'label': 'Acknowledgment',
                            'value': '%s | .... ...%d ....'
                                     % (self.ipBody.tcpHeader.flags.acknowledgement,
                                        self.ipBody.tcpHeader.flags.acknowledgement)
                        }, {
                            'label': 'Push',
                            'value': '%s | .... .... %d...'
                                     % (self.ipBody.tcpHeader.flags.push,
                                        self.ipBody.tcpHeader.flags.push)
                        }, {
                            'label': 'Reset',
                            'value': '%s | .... .... .%d..'
                                     % (self.ipBody.tcpHeader.flags.reset,
                                        self.ipBody.tcpHeader.flags.reset)
                        }, {
                            'label': 'Syn',
                            'value': '%s | .... .... ..%d.'
                                     % (self.ipBody.tcpHeader.flags.syn,
                                        self.ipBody.tcpHeader.flags.syn)
                        }, {
                            'label': 'Fin',
                            'value': '%s | .... .... ...%d'
                                     % (self.ipBody.tcpHeader.flags.fin,
                                        self.ipBody.tcpHeader.flags.fin)
                        }]
                    }, {
                        'label': '窗口大小',
                        'value': self.ipBody.tcpHeader.window_size
                    }, {
                        'label': '校验和',
                        'value': '0x%s (%s)' % (self.ipBody.tcpHeader.checksum,
                                                '校验' + {True: '通过', False: '失败'}[self.ipBody.tcpHeader.verifyChecksum])
                    }]
                }
                options = []
                if self.ipBody.tcpHeader.options:
                    for opt in self.ipBody.tcpHeader.options:
                        options.append({
                            'label': opt[0]['label'],
                            'value': '(%s)' % opt[0]['value'],
                            'children': opt[1:]
                        })
                if options:
                    tcp_header['children'].append({
                        'label': '选项',
                        'value': '',
                        'children': options
                    })
                data.append(tcp_header)
                print(self.id)
                print(tcp_bodies)
                if self.id in packet_id_struct:
                    tmp = []
                    http_payload = None
                    for p_id in packet_id_struct[self.id]:
                        tmp.append({'value': '', 'label': '#%s' % p_id})
                    if self.id in tcp_bodies:
                        # print(tcp_bodies[self.id]['data'].decode('utf-8', 'ignore'))
                        children = [{
                            'label': '该包是 TCP 分段的最后一段, 可以通过右下角按钮「导出 TCP 分段数据」.',
                            'value': '',
                            'bold': True
                        }, {
                            'label': '共 %s 个分段' % len(tmp),
                            'value': '',
                            'bold': True,
                            'children': tmp
                        }]
                        try:
                            p = HttpParser()
                            recved = len(tcp_bodies[self.id]['data'])
                            nparsed = p.execute(tcp_bodies[self.id]['data'], recved)
                            # if the parser did not consume everything, this is not HTTP
                            assert nparsed == recved
                            headers = [{'label': name, 'value': value}
                                       for name, value in p.get_headers().items()]
                            print(p.get_path(), p.get_url(), p.get_fragment(),
                                  p.get_method(), p.get_query_string(),
                                  p.get_status_code(), p.get_wsgi_environ())
                            http_payload = [{
                                'label': 'HTTP 版本',
                                'value': '%s.%s' % (p.get_version()[0], p.get_version()[1])
                            }, {
                                'label': 'HTTP 头部',
                                'value': '',
                                'children': headers
                            }]
                            if p.get_url():
                                # a URL was parsed, so this is a request
                                http_payload.append({
                                    'label': '请求方式',
                                    'value': p.get_method()
                                })
                                http_payload.append({
                                    'label': '路径',
                                    'value': p.get_url()
                                })
                                http_payload.append({
                                    'label': '请求参数',
                                    'value': p.get_query_string()
                                })
                                http_payload.append({
                                    'label': '主机名',
                                    'value': p.get_wsgi_environ()['HTTP_HOST']
                                })
                            else:
                                # no URL means this side is a response
                                http_payload.append({
                                    'label': '状态码',
                                    'value': p.get_status_code()
                                })
                        except AssertionError:
                            pass
                    else:
                        children = [{
                            'label': '共 %s 个分段' % len(tmp),
                            'value': '',
                            'bold': True,
                            'children': tmp
                        }]
                    data.append({
                        'label': 'TCP 数据 / TCP Payload',
                        'value': '',
                        'bold': True,
                        'children': children
                    })
                    if http_payload is not None:
                        data.append({
                            'label': 'HTTP 数据 / HTTP Data',
                            'value': '',
                            'bold': True,
                            'children': http_payload
                        })
                '''
                if self.ipBody.tcpBody.has_body:
                    try:
                        p = HttpParser()
                        recved = len(self.ipBody.tcpBody.buf)
                        nparsed = p.execute(self.ipBody.tcpBody.buf, recved)
                        assert nparsed == recved
                        print(p.get_headers())
                    except AssertionError:
                        print('NOT HTTP')
                    data.append({
                        'label': 'TCP 数据 / Data',
                        'value': '',
                        'bold': True,
                        'children': [{
                            'label': '数据',
                            'value': self.ipBody.tcpBody.raw
                        }]
                    })
                '''
            elif self.ipHeader.protocol == 'UDP':
                self.ipBody.udpHeader.verifyChecksum = verifyChecksum(
                    self.ipBody.parameters[0], self.ipBody.parameters[1],
                    self.ipHeader.protocol).verifyChecksum
                data.append({
                    'label': 'UDP 头部 / User Datagram Protocol Header',
                    'value': '',
                    'bold': True,
                    'children': [{
                        'label': '源端口',
                        'value': self.ipBody.udpHeader.source_port
                    }, {
                        'label': '目的端口',
                        'value': self.ipBody.udpHeader.destination_port
                    }, {
                        'label': '长度',
                        'value': self.ipBody.udpHeader.length
                    }, {
                        'label': '校验和',
                        'value': '0x%s (%s)' % (self.ipBody.udpHeader.checksum,
                                                '校验' + {True: '通过', False: '失败'}[self.ipBody.udpHeader.verifyChecksum])
                    }]
                })
                if self.ipBody.udpHeader.source_port == 53 or self.ipBody.udpHeader.destination_port == 53:
                    # DNS traffic
                    children = [{
                        'label': '会话标识',
                        'value': self.ipBody.dnsBody.transaction_id
                    }, {
                        'label': '标志',
                        # `flags` is an assumed attribute name: the original reused
                        # transaction_id here, which cannot be the flags field
                        'value': '0x' + self.ipBody.dnsBody.flags
                    }, {
                        'label': '问题数',
                        'value': self.ipBody.dnsBody.questions
                    }, {
                        'label': '回答资源记录数',
                        'value': self.ipBody.dnsBody.answer_rrs
                    }, {
                        'label': '授权资源记录数',
                        'value': self.ipBody.dnsBody.authority_rrs
                    }, {
                        'label': '附加资源记录数',
                        'value': self.ipBody.dnsBody.additional_rrs
                    }]
                    if self.ipBody.dnsBody.queries:
                        queries = []
                        for query in self.ipBody.dnsBody.queries:
                            queries.append({
                                'label': str(query.qname),
                                'value': '',
                                'bold': True,
                                'children': [{
                                    'label': '域名',
                                    'value': str(query.qname)
                                }, {
                                    'label': 'Type',
                                    'value': '%s (%s)' % (consts.dns_types[query.qtype], query.qtype)
                                }, {
                                    'label': 'Class',
                                    'value': '%s (%s)' % (consts.dns_classes[query.qclass], query.qclass)
                                }]
                            })
                        children.append({
                            'label': '查询问题',
                            'value': '',
                            'bold': True,
                            'children': queries
                        })
                    if self.ipBody.dnsBody.answers:
                        answers = []
                        for answer in self.ipBody.dnsBody.answers:
                            answers.append({
                                'label': str(answer.rname),
                                'value': '',
                                'bold': True,
                                'children': [{
                                    'label': '域名',
                                    'value': str(answer.rname)
                                }, {
                                    'label': 'Type',
                                    'value': '%s (%s)' % (consts.dns_types[answer.rtype], answer.rtype)
                                }, {
                                    'label': 'Class',
                                    'value': '%s (%s)' % (consts.dns_classes[answer.rclass], answer.rclass)
                                }, {
                                    'label': '生存时间 (ttl)',
                                    'value': str(answer.ttl)
                                }, {
                                    'label': '数据',
                                    'value': str(answer.rdata)
                                }]
                            })
                        children.append({
                            'label': '回答',
                            'value': '',
                            'bold': True,
                            'children': answers
                        })
                    data.append({
                        'label': 'DNS / Domain Name System',
                        'value': '',
                        'bold': True,
                        'children': children
                    })
            elif 'ICMP' in self.ipHeader.protocol:
                if 'IPv6' in self.ipHeader.protocol:
                    # the ICMPv6 checksum covers a pseudo-header, so pass the parameters through
                    self.ipBody.icmpHeader.verifyChecksum = verifyChecksum(
                        self.ipBody.parameters[0], self.ipBody.parameters[1],
                        self.ipHeader.protocol).verifyChecksum
                else:
                    self.ipBody.icmpHeader.verifyChecksum = verifyChecksum(
                        self.ipBody.parameters, [], '').verifyChecksum
                data.append({
                    'label': 'ICMP 头部 / Internet Control Message Protocol Headers',
                    'value': '',
                    'bold': True,
                    'children': [{
                        'label': '类型',
                        'value': '%s (%s)' % (self.ipBody.icmpHeader.type,
                                              self.ipBody.icmpHeader.type_name)
                    }, {
                        'label': '代码',
                        'value': self.ipBody.icmpHeader.code
                    }, {
                        'label': '校验和',
                        'value': '0x%s (%s)' % (self.ipBody.icmpHeader.checksum,
                                                '校验' + {True: '通过', False: '失败'}[self.ipBody.icmpHeader.verifyChecksum])
                    }]
                })
            elif 'IGMP' in self.ipHeader.protocol:
                if self.ipHeader.payload_length == 8:
                    self.ipBody.igmpHeader.verifyChecksum = verifyChecksum(
                        self.ipBody.parameters, [], '').verifyChecksum
                    data.append({
                        'label': 'IGMP 头部 / Internet Group Management Protocol Headers',
                        'value': '',
                        'bold': True,
                        'children': [{
                            'label': '类型',
                            'value': '0x%s (%s)' % (self.ipBody.igmpHeader.type,
                                                    self.ipBody.igmpHeader.type_name)
                        }, {
                            'label': '最大响应时延',
                            'value': '%s 秒 (0x%s)' % (self.ipBody.igmpHeader.maxRespTime,
                                                      self.ipBody.igmpHeader.maxRespTimeHex)
                        }, {
                            'label': '校验和',
                            'value': '0x%s (%s)' % (self.ipBody.igmpHeader.checksum,
                                                    '校验' + {True: '通过', False: '失败'}[self.ipBody.igmpHeader.verifyChecksum])
                        }, {
                            'label': '组地址',
                            'value': self.ipBody.igmpHeader.groupAddress
                        }]
                    })
                else:
                    self.ipBody.igmpv3Header.verifyChecksum = verifyChecksum(
                        self.ipBody.parameters, [], '').verifyChecksum
                    data.append({
                        'label': 'IGMPv3 头部 / Internet Group Management Protocol Version 3 Headers',
                        'value': '',
                        'bold': True,
                        'children': [{
                            'label': '类型',
                            'value': '0x%s' % self.ipBody.igmpv3Header.type
                        }, {
                            'label': '校验和',
                            'value': '0x%s (%s)' % (self.ipBody.igmpv3Header.checksum,
                                                    '校验' + {True: '通过', False: '失败'}[self.ipBody.igmpv3Header.verifyChecksum])
                        }]
                    })
    return data
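# A standalone sketch (not from the original sources) of the request-vs-response
# test used in the TCP branch of parse() above: HttpParser must consume the whole
# reassembled payload, and the presence of a parsed URL distinguishes a request
# from a response. classify_payload and the sample payloads are invented for
# illustration.
from http_parser.pyparser import HttpParser


def classify_payload(payload):
    '''Return 'request', 'response', or None for a reassembled TCP payload.'''
    p = HttpParser()
    if p.execute(payload, len(payload)) != len(payload):
        return None  # parser gave up mid-stream: not HTTP (the AssertionError path above)
    if p.get_url():
        return 'request'   # only requests carry a URL
    if p.get_status_code():
        return 'response'  # only responses carry a status code
    return None


if __name__ == '__main__':
    req = b'GET /index.html?q=1 HTTP/1.1\r\nHost: example.com\r\n\r\n'
    resp = b'HTTP/1.1 404 Not Found\r\nContent-Length: 0\r\n\r\n'
    junk = b'\x16\x03\x01\x00\x05hello'  # a TLS-looking record, not HTTP
    for payload in (req, resp, junk):
        print(classify_payload(payload))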