# can't do for all responses because we need to support binary files too if not isinstance(page, unicode) and "text/" in contentType: if kb.heuristicMode: kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) else: # e.g. Ãëàâà if "&#" in page: page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) # e.g. %20%28%29 if "%" in page: page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) # e.g. & page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) if (kb.pageEncoding or "").lower() == "utf-8-sig": kb.pageEncoding = "utf-8" if page and page.startswith("\xef\xbb\xbf"): # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling) page = page[3:] page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: def _(match): retVal = match.group(0) try:
# can't do for all responses because we need to support binary files too if contentType and not isinstance(page, unicode) and "text/" in contentType.lower(): if kb.heuristicMode: kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) else: # e.g. Ãëàâà if "&#" in page: page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) # e.g. %20%28%29 if "%" in page: page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) # e.g. & page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: def _(match): retVal = match.group(0) try: retVal = unichr(int(match.group(1))) except ValueError: pass return retVal page = re.sub(r"&#(\d+);", _, page)
else: # e.g. Ãëàâà if "&#" in page: page = re.sub( r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) # e.g. %20%28%29 if "%" in page: page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) # e.g. & page = re.sub( r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding( getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: def _(match): retVal = match.group(0) try: retVal = unichr(int(match.group(1))) except ValueError: pass
def decodePage(page, contentEncoding, contentType): """ Decode compressed/charset HTTP response """ if not page or (conf.nullConnection and len(page) < 2): return getUnicode(page) if hasattr(contentEncoding, "lower"): contentEncoding = contentEncoding.lower() else: contentEncoding = "" if hasattr(contentType, "lower"): contentType = contentType.lower() else: contentType = "" if contentEncoding in ("gzip", "x-gzip", "deflate"): if not kb.pageCompress: return None try: if contentEncoding == "deflate": data = io.BytesIO(zlib.decompress(page, -15)) # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations else: data = gzip.GzipFile("", "rb", 9, io.BytesIO(page)) size = struct.unpack("<l", page[-4:])[0] # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py if size > MAX_CONNECTION_TOTAL_SIZE: raise Exception("size too large") page = data.read() except Exception as ex: if "<html" not in page: # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored) errMsg = "detected invalid data for declared content " errMsg += "encoding '%s' ('%s')" % (contentEncoding, getSafeExString(ex)) singleTimeLogMessage(errMsg, logging.ERROR) warnMsg = "turning off page compression" singleTimeWarnMessage(warnMsg) kb.pageCompress = False raise SqlmapCompressionException if not conf.encoding: httpCharset, metaCharset = None, None # Reference: http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode if contentType.find("charset=") != -1: httpCharset = checkCharEncoding(contentType.split("charset=")[-1]) metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page)) if (any((httpCharset, metaCharset)) and not all((httpCharset, metaCharset))) or (httpCharset == metaCharset and all((httpCharset, metaCharset))): kb.pageEncoding = httpCharset or metaCharset # Reference: http://bytes.com/topic/html-css/answers/154758-http-equiv-vs-true-header-has-precedence debugMsg = "declared web page charset '%s'" % kb.pageEncoding singleTimeLogMessage(debugMsg, logging.DEBUG, debugMsg) else: kb.pageEncoding = None else: kb.pageEncoding = conf.encoding # can't do for all responses because we need to support binary files too if isinstance(page, six.binary_type) and "text/" in contentType: # e.g. 	Ãëàâà if b"&#" in page: page = re.sub(b"&#x([0-9a-f]{1,2});", lambda _: decodeHex(_.group(1) if len(_.group(1)) == 2 else "0%s" % _.group(1)), page) page = re.sub(b"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) # e.g. %20%28%29 if b"%" in page: page = re.sub(b"%([0-9a-fA-F]{2})", lambda _: decodeHex(_.group(1)), page) # e.g. & page = re.sub(b"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) if (kb.pageEncoding or "").lower() == "utf-8-sig": kb.pageEncoding = "utf-8" if page and page.startswith("\xef\xbb\xbf"): # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling) page = page[3:] page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: def _(match): retVal = match.group(0) try: retVal = unichr(int(match.group(1))) except (ValueError, OverflowError): pass return retVal page = re.sub(r"&#(\d+);", _, page) # e.g. ζ page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page) return page
page = getUnicode(page, kb.pageEncoding) else: # e.g. Ãëàâà if "&#" in page: page = re.sub( r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page ) # e.g. %20%28%29 if "%" in page: page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) # e.g. & page = re.sub( r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page, ) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: def _(match): retVal = match.group(0) try: retVal = unichr(int(match.group(1))) except ValueError: pass
def decodePage(page, contentEncoding, contentType): """ Decode compressed/charset HTTP response """ if not page or (conf.nullConnection and len(page) < 2): return getUnicode(page) if hasattr(contentEncoding, "lower"): contentEncoding = contentEncoding.lower() else: contentEncoding = "" if hasattr(contentType, "lower"): contentType = contentType.lower() else: contentType = "" if contentEncoding in ("gzip", "x-gzip", "deflate"): if not kb.pageCompress: return None try: if contentEncoding == "deflate": data = io.BytesIO(zlib.decompress(page, -15)) # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations else: data = gzip.GzipFile("", "rb", 9, io.BytesIO(page)) size = struct.unpack("<l", page[-4:])[0] # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py if size > MAX_CONNECTION_TOTAL_SIZE: raise Exception("size too large") page = data.read() except Exception as ex: if "<html" not in page: # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored) errMsg = "detected invalid data for declared content " errMsg += "encoding '%s' ('%s')" % (contentEncoding, getSafeExString(ex)) singleTimeLogMessage(errMsg, logging.ERROR) warnMsg = "turning off page compression" singleTimeWarnMessage(warnMsg) kb.pageCompress = False raise SqlmapCompressionException if not conf.encoding: httpCharset, metaCharset = None, None # Reference: http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode if contentType.find("charset=") != -1: httpCharset = checkCharEncoding(contentType.split("charset=")[-1]) metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page)) if (any((httpCharset, metaCharset)) and not all((httpCharset, metaCharset))) or (httpCharset == metaCharset and all((httpCharset, metaCharset))): kb.pageEncoding = httpCharset or metaCharset # Reference: http://bytes.com/topic/html-css/answers/154758-http-equiv-vs-true-header-has-precedence debugMsg = "declared web page charset '%s'" % kb.pageEncoding singleTimeLogMessage(debugMsg, logging.DEBUG, debugMsg) else: kb.pageEncoding = None else: kb.pageEncoding = conf.encoding # can't do for all responses because we need to support binary files too if isinstance(page, six.binary_type) and "text/" in contentType: # e.g. 	Ãëàâà if "&#" in page: page = re.sub(r"&#x([0-9a-f]{1,2});", lambda _: (_.group(1) if len(_.group(1)) == 2 else "0%s" % _.group(1)).decode("hex"), page) page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page) # e.g. %20%28%29 if "%" in page: page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page) # e.g. & page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page) kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page)) if (kb.pageEncoding or "").lower() == "utf-8-sig": kb.pageEncoding = "utf-8" if page and page.startswith("\xef\xbb\xbf"): # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling) page = page[3:] page = getUnicode(page, kb.pageEncoding) # e.g. ’…™ if "&#" in page: def _(match): retVal = match.group(0) try: retVal = unichr(int(match.group(1))) except (ValueError, OverflowError): pass return retVal page = re.sub(r"&#(\d+);", _, page) # e.g. ζ page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page) return page