Пример #1
0
            kb.pageEncoding = None
    else:
        kb.pageEncoding = conf.charset

    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
        # e.g. Ãëàâà
        if "&#" in page:
            page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

        # e.g. %20%28%29
        if "%" in page:
            page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)

        # e.g. &amp;
        page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)

        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
        page = getUnicode(page, kb.pageEncoding)

        # e.g. &#8217;&#8230;&#8482;
        if "&#" in page:
            page = re.sub(r"&#(\d+);", lambda _: unichr(int(_.group(1))), page)

        # e.g. &zeta;
        page = re.sub(r"&([^;]+);", lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)

    return page

def processResponse(page, responseHeaders):
    kb.processResponseCounter += 1
Пример #2
0
        kb.pageEncoding = conf.charset

    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and any(
            map(lambda x: x in contentType.lower(),
                ("text/txt", "text/raw", "text/html", "text/xml"))):
        # e.g. &#195;&#235;&#224;&#226;&#224;
        if "&#" in page:
            page = re.sub(
                '&#(\d{1,3});', lambda _: chr(int(_.group(1)))
                if int(_.group(1)) < 256 else _.group(0), page)

        # e.g. &amp;
        page = re.sub(
            '&([^;]+);', lambda _: chr(htmlEntities[_.group(1)])
            if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)

        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(
            getHeuristicCharEncoding(page))
        page = getUnicode(page, kb.pageEncoding)

        # e.g. &#8217;&#8230;&#8482;
        if "&#" in page:
            page = re.sub('&#(\d+);', lambda _: unichr(int(_.group(1))), page)

        # e.g. &zeta;
        page = re.sub(
            '&([^;]+);', lambda _: unichr(htmlEntities[_.group(1)])
            if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)

    return page
Пример #3
0
        if ((httpCharset or metaCharset) and not all([httpCharset, metaCharset]))\
            or (httpCharset == metaCharset and all([httpCharset, metaCharset])):
            kb.pageEncoding = httpCharset or metaCharset
        else:
            kb.pageEncoding = None
    else:
        kb.pageEncoding = conf.charset

    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and any(map(lambda _: _ in contentType.lower(), ("text/txt", "text/raw", "text/html", "text/xml"))):
        # e.g. &#195;&#235;&#224;&#226;&#224;
        if "&#" in page:
            page = re.sub('&#(\d{1,3});', lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

        # e.g. &amp;
        page = re.sub('&([^;]+);', lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)

        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
        page = getUnicode(page, kb.pageEncoding)

        # e.g. &#8217;&#8230;&#8482;
        if "&#" in page:
            page = re.sub('&#(\d+);', lambda _: unichr(int(_.group(1))), page)

        # e.g. &zeta;
        page = re.sub('&([^;]+);', lambda _: unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)

    return page

def processResponse(page, responseHeaders):
    kb.processResponseCounter += 1