示例#1
0
def htmlParser(response, encoding):
    rawResponse = response  # raw response returned by requests
    response = response.text  # response content
    if encoding:  # if the user has specified an encoding, encode the probe in that
        response = response.replace(encoding(xsschecker), xsschecker)
    reflections = response.count(xsschecker)
    position_and_context = {}
    environment_details = {}
    clean_response = re.sub(r"<!--[.\s\S]*?-->", "", response)
    script_checkable = clean_response
    for i in range(reflections):
        occurence = re.search(
            r"(?i)(?s)<script[^>]*>.*?(%s).*?</script>" % xsschecker,
            script_checkable)
        if occurence:
            thisPosition = occurence.start(1)
            position_and_context[thisPosition] = "script"
            environment_details[thisPosition] = {}
            environment_details[thisPosition]["details"] = {"quote": ""}
            for i in range(len(occurence.group())):
                currentChar = occurence.group()[i]
                if currentChar in ("'", "`",
                                   '"') and not escaped(i, occurence.group()):
                    environment_details[thisPosition]["details"][
                        "quote"] = currentChar
                elif currentChar in (")", "]", "}", "}") and not escaped(
                        i, occurence.group()):
                    break
            script_checkable = script_checkable.replace(xsschecker, "", 1)
    if len(position_and_context) < reflections:
        attribute_context = re.finditer(r"<[^>]*?(%s)[^>]*?>" % xsschecker,
                                        clean_response)
        for occurence in attribute_context:
            match = occurence.group(0)
            thisPosition = occurence.start(1)
            parts = re.split(r"\s", match)
            tag = parts[0][1:]
            for part in parts:
                if xsschecker in part:
                    Type, quote, name, value = "", "", "", ""
                    if "=" in part:
                        quote = re.search(r'=([\'`"])?', part).group(1)
                        name_and_value = part.split("=")[0], "=".join(
                            part.split("=")[1:])
                        if xsschecker == name_and_value[0]:
                            Type = "name"
                        else:
                            Type = "value"
                        name = name_and_value[0]
                        value = (name_and_value[1].rstrip(">").rstrip(
                            quote).lstrip(quote))
                    else:
                        Type = "flag"
                    position_and_context[thisPosition] = "attribute"
                    environment_details[thisPosition] = {}
                    environment_details[thisPosition]["details"] = {
                        "tag": tag,
                        "type": Type,
                        "quote": quote,
                        "value": value,
                        "name": name,
                    }
    if len(position_and_context) < reflections:
        html_context = re.finditer(xsschecker, clean_response)
        for occurence in html_context:
            thisPosition = occurence.start()
            if thisPosition not in position_and_context:
                position_and_context[occurence.start()] = "html"
                environment_details[thisPosition] = {}
                environment_details[thisPosition]["details"] = {}
    if len(position_and_context) < reflections:
        comment_context = re.finditer(
            r"<!--(?![.\s\S]*-->)[.\s\S]*(%s)[.\s\S]*?-->" % xsschecker,
            response)
        for occurence in comment_context:
            thisPosition = occurence.start(1)
            position_and_context[thisPosition] = "comment"
            environment_details[thisPosition] = {}
            environment_details[thisPosition]["details"] = {}
    database = {}
    for i in sorted(position_and_context):
        database[i] = {}
        database[i]["position"] = i
        database[i]["context"] = position_and_context[i]
        database[i]["details"] = environment_details[i]["details"]

    bad_contexts = re.finditer(
        r"(?s)(?i)<(style|template|textarea|title|noembed|noscript)>[.\s\S]*(%s)[.\s\S]*</\1>"
        % xsschecker,
        response,
    )
    non_executable_contexts = []
    for each in bad_contexts:
        non_executable_contexts.append(
            [each.start(), each.end(), each.group(1)])

    if non_executable_contexts:
        for key in database.keys():
            position = database[key]["position"]
            badTag = isBadContext(position, non_executable_contexts)
            if badTag:
                database[key]["details"]["badTag"] = badTag
            else:
                database[key]["details"]["badTag"] = ""
    return database
示例#2
0
def htmlParser(response, encoding):
    rawResponse = response  # raw response returned by requests 请求返回的原始响应
    response = response.text  # response content
    if encoding:  # if the user has specified an encoding, encode the probe in that 如果用户指定了编码,则使用该编码对探测器进行编码
        response = response.replace(encoding(xsschecker), xsschecker)
        # replace() 把字符串中的 old(旧字符串) 替换成 new(新字符串),如果指定第三个参数max,则替换不超过 max 次。
    reflections = response.count(xsschecker)  #用于统计字符串里某个字符 xsschecker 出现的次数
    position_and_context = {}  #位置上下文
    environment_details = {}  #环境细节
    clean_response = re.sub(r'<!--[.\s\S]*?-->', '',
                            response)  #re.sub用于替换字符串中的匹配项 去掉标签
    #[\s\S]*?表示匹配任意字符,且只匹配一次,即懒惰匹配; [\s\S]*没有带?号,也表示匹配任意字符,但允许匹配任意次,即贪婪匹配。
    script_checkable = clean_response
    for script in extractScripts(script_checkable):
        occurences = re.finditer(r'(%s.*?)$' % xsschecker, script)
        #re.finditer(pattern, string, flags=0)返回一个产生匹配对象实体的迭代器,能产生字符串中所有RE模式串的非重叠匹配。
        #字符串被从左向右扫描,匹配按发现顺序返回。空字符串被包括在结果中除非它们触碰到另一个匹配的开头。
        #flags参数是可选参数。如果向它传递re模块中的宏常量,就会对匹配方式产生对应的影响。
        if occurences:
            for occurence in occurences:
                thisPosition = occurence.start(1)
                position_and_context[thisPosition] = 'script'
                environment_details[thisPosition] = {}
                environment_details[thisPosition]['details'] = {'quote': ''}
                for i in range(len(occurence.group())):
                    #group()返回一个或多个匹配的字串。如果只有一个参数,结果只有单个字符串;如果有多个参数,结果是一个元组,元组里每一项对应一个参数。
                    # 没有参数,group1默认是0(整个匹配串被返回)。如果groupN参数是0,对应的返回值是整个匹配串;如果它属于[1,99],返回对应的一项括号分隔的群。
                    # 如果参数是负数或大于模式串中定义的群数,IndexError异常会被抛出。如果模式串没有任何匹配,group返回None;如果模式串多次匹配,group将返回最后一次匹配。
                    currentChar = occurence.group()[i]
                    if currentChar in ('/', '\'', '`', '"') and not escaped(
                            i, occurence.group()):  #编码
                        environment_details[thisPosition]['details'][
                            'quote'] = currentChar
                    elif currentChar in (')', ']', '}', '}') and not escaped(
                            i, occurence.group()):
                        break
                script_checkable = script_checkable.replace(
                    xsschecker, '', 1
                )  #replace("is", "was", 3)返回字符串中的 old(旧字符串) 替换成 new(新字符串)后生成的新字符串,如果指定第三个参数max,则替换不超过 max 次。
    if len(position_and_context) < reflections:
        attribute_context = re.finditer(r'<[^>]*?(%s)[^>]*?>' % xsschecker,
                                        clean_response)
        for occurence in attribute_context:
            match = occurence.group(0)
            thisPosition = occurence.start(1)
            parts = re.split(r'\s', match)  # \s用于匹配空白字符。
            tag = parts[0][1:]
            for part in parts:
                if xsschecker in part:
                    Type, quote, name, value = '', '', '', ''
                    if '=' in part:
                        quote = re.search(r'=([\'`"])?', part).group(1)
                        name_and_value = part.split('=')[0], '='.join(
                            part.split('=')[1:])
                        if xsschecker == name_and_value[0]:
                            Type = 'name'
                        else:
                            Type = 'value'
                        name = name_and_value[0]
                        value = name_and_value[1].rstrip('>').rstrip(
                            quote).lstrip(quote)
                    else:
                        Type = 'flag'
                    position_and_context[thisPosition] = 'attribute'
                    environment_details[thisPosition] = {}
                    environment_details[thisPosition]['details'] = {
                        'tag': tag,
                        'type': Type,
                        'quote': quote,
                        'value': value,
                        'name': name
                    }
    if len(position_and_context) < reflections:
        html_context = re.finditer(xsschecker, clean_response)
        for occurence in html_context:
            thisPosition = occurence.start()
            if thisPosition not in position_and_context:
                position_and_context[occurence.start()] = 'html'
                environment_details[thisPosition] = {}
                environment_details[thisPosition]['details'] = {}
    if len(position_and_context) < reflections:
        comment_context = re.finditer(
            r'<!--[\s\S]*?(%s)[\s\S]*?-->' % xsschecker, response)
        for occurence in comment_context:
            thisPosition = occurence.start(1)
            position_and_context[thisPosition] = 'comment'
            environment_details[thisPosition] = {}
            environment_details[thisPosition]['details'] = {}
    database = {}
    for i in sorted(position_and_context):
        database[i] = {}
        database[i]['position'] = i
        database[i]['context'] = position_and_context[i]
        database[i]['details'] = environment_details[i]['details']

    bad_contexts = re.finditer(
        r'(?s)(?i)<(style|template|textarea|title|noembed|noscript)>[.\s\S]*(%s)[.\s\S]*</\1>'
        % xsschecker, response)
    non_executable_contexts = []
    for each in bad_contexts:
        non_executable_contexts.append(
            [each.start(), each.end(), each.group(1)])

    if non_executable_contexts:
        for key in database.keys():
            position = database[key]['position']
            badTag = isBadContext(position, non_executable_contexts)
            if badTag:
                database[key]['details']['badTag'] = badTag
            else:
                database[key]['details']['badTag'] = ''
    return database
示例#3
0
def htmlParser(response, encoding):
    rawResponse = response  # raw response returned by requests
    response = response.text  # response content
    if encoding:  # if the user has specified an encoding, encode the probe in that
        response = response.replace(encoding(xsschecker), xsschecker)
    reflections = response.count(xsschecker)
    position_and_context = {}
    environment_details = {}
    clean_response = re.sub(r'<!--[.\s\S]*?-->', '', response)
    script_checkable = clean_response
    for i in range(reflections):
        occurence = re.search(
            r'(?i)(?s)<script[^>]*>.*?(%s).*?</script>' % xsschecker,
            script_checkable)
        if occurence:
            thisPosition = occurence.start(1)
            position_and_context[thisPosition] = 'script'
            environment_details[thisPosition] = {}
            environment_details[thisPosition]['details'] = {'quote': ''}
            for i in range(len(occurence.group())):
                currentChar = occurence.group()[i]
                if currentChar in ('\'', '`',
                                   '"') and not escaped(i, occurence.group()):
                    environment_details[thisPosition]['details'][
                        'quote'] = currentChar
                elif currentChar in (')', ']', '}', '}') and not escaped(
                        i, occurence.group()):
                    break
            script_checkable = script_checkable.replace(xsschecker, '', 1)
    if len(position_and_context) < reflections:
        attribute_context = re.finditer(r'<[^>]*?(%s)[^>]*?>' % xsschecker,
                                        clean_response)
        for occurence in attribute_context:
            match = occurence.group(0)
            thisPosition = occurence.start(1)
            parts = re.split(r'\s', match)
            tag = parts[0][1:]
            for part in parts:
                if xsschecker in part:
                    Type, quote, name, value = '', '', '', ''
                    if '=' in part:
                        quote = re.search(r'=([\'`"])?', part).group(1)
                        name_and_value = part.split('=')[0], '='.join(
                            part.split('=')[1:])
                        if xsschecker == name_and_value[0]:
                            Type = 'name'
                        else:
                            Type = 'value'
                        name = name_and_value[0]
                        value = name_and_value[1].rstrip('>').rstrip(
                            quote).lstrip(quote)
                    else:
                        Type = 'flag'
                    position_and_context[thisPosition] = 'attribute'
                    environment_details[thisPosition] = {}
                    environment_details[thisPosition]['details'] = {
                        'tag': tag,
                        'type': Type,
                        'quote': quote,
                        'value': value,
                        'name': name
                    }
    if len(position_and_context) < reflections:
        html_context = re.finditer(xsschecker, clean_response)
        for occurence in html_context:
            thisPosition = occurence.start()
            if thisPosition not in position_and_context:
                position_and_context[occurence.start()] = 'html'
                environment_details[thisPosition] = {}
                environment_details[thisPosition]['details'] = {}
    if len(position_and_context) < reflections:
        comment_context = re.finditer(
            r'<!--(?![.\s\S]*-->)[.\s\S]*(%s)[.\s\S]*?-->' % xsschecker,
            response)
        for occurence in comment_context:
            thisPosition = occurence.start(1)
            position_and_context[thisPosition] = 'comment'
            environment_details[thisPosition] = {}
            environment_details[thisPosition]['details'] = {}
    database = {}
    for i in sorted(position_and_context):
        database[i] = {}
        database[i]['position'] = i
        database[i]['context'] = position_and_context[i]
        database[i]['details'] = environment_details[i]['details']

    bad_contexts = re.finditer(
        r'(?s)(?i)<(style|template|textarea|title|noembed|noscript)>[.\s\S]*(%s)[.\s\S]*</\1>'
        % xsschecker, response)
    non_executable_contexts = []
    for each in bad_contexts:
        non_executable_contexts.append(
            [each.start(), each.end(), each.group(1)])

    if non_executable_contexts:
        for key in database.keys():
            position = database[key]['position']
            badTag = isBadContext(position, non_executable_contexts)
            if badTag:
                database[key]['details']['badTag'] = badTag
            else:
                database[key]['details']['badTag'] = ''
    return database
示例#4
0
def htmlParser(response, encoding):
    rawResponse = response  # raw response returned by requests
    response = response.text  # response content
    if encoding:  # if the user has specified an encoding, encode the probe in that
        response = response.replace(encoding(xsschecker), xsschecker)
    tags = []  # tags in which the input is reflected
    locations = []  # contexts in which the input is reflected
    attributes = []  # attribute names
    environments = []  # strings needed to break out of the context
    positions = []  # postions of all the reflections of the xsschecker
    for match in re.finditer(xsschecker, response):
        positions.append(match.start())

#  It finds the contexts of the reflections

    parts = response.split(xsschecker)
    # remove first element since it doesn't contain xsschecker
    parts.remove(parts[0])
    # add xsschecker in front of all elements
    parts = [xsschecker + s for s in parts]
    for part in parts:  # iterate over the parts
        deep = part.split('>')
        if '</script' in deep[0]:
            location = 'script'
        elif '</' in deep[0] or len(parts) == 1:
            location = 'html'
        else:
            num = 0
            for i in deep:
                if i[-2:] == '--':
                    if '<!--' not in ''.join(deep[:num + 1]):
                        location = 'comment'
                        break
                        continue
                location = 'script'
                for char in part:
                    # the only way to find out if it's attribute context is to see if '<' is present.
                    if char == '<':
                        location = 'attribute'  # no, it doesn't match '<script>'
                        break
                num += 1
        if '<' not in response:
            if rawResponse.headers['Content-Type'].startswith('text/html'):
                location = 'html'
        locations.append(location)  # add location to locations list

    bad_contexts = re.finditer(
        r'''(?s)(?i)<(style|template|textarea|title|noembed|noscript)>[.\s\S]*(%s)[.\s\S]*</\1>'''
        % xsschecker, response)
    non_executable_contexts = []
    for each in bad_contexts:
        non_executable_contexts.append(
            [each.start(), each.end(), each.group(1)])
#  Finds the "environment" of reflections. is it within double quotes? Which tag contains the reflection?
    num = 0  # dummy value to keep record of occurence being processed
    # find xsschecker in response and return matches
    for occ in re.finditer(xsschecker, response, re.IGNORECASE):
        # convert "xsschecker to EOF" into a list
        toLook = list(response[occ.end():])
        for loc in range(len(toLook)):  # interate over the chars
            if toLook[loc] in ('\'', '"', '`'):  # if the char is a quote
                environments.append(toLook[loc])  # add it to environments list
                tokens = response.split('<')
                goodTokens = []  # tokens which contain xsschecker
                for token in tokens:  # iterate over tokens
                    if xsschecker in token:  # if xsschecker is in token
                        goodTokens.append(token)  # add it to goodTokens list
                        # attributes and their values are generally seperated with space so...
                        attrs = token.split(' ')
                        for attr in attrs:  # iterate over the attribute
                            if xsschecker in attr:  # is xsschecker in this attribute?
                                # alright, this is the one we need
                                attributeName = attr.split('=')[0]
                                attributeValue = ''.join(attr.split('=')[1:])
                                if attributeValue.startswith(
                                        '\'') or attributeValue.startswith(
                                            '"'):
                                    attributeValue = attributeValue[1:-1]
                                attributes.append(
                                    {attributeName: attributeValue})
                                break
                try:
                    # finds the tag "inside" which input is refelcted
                    tag = re.search(r'\w+', goodTokens[num]).group()
                except IndexError:
                    try:
                        # finds the tag "inside" which input is refelcted
                        tag = re.search(r'\w+', goodTokens[num - 1]).group()
                    except IndexError:
                        tag = 'null'
                tags.append(tag)  # add the tag to the tags list
                break
            else:  # if we encounter a closing angular brackt
                # check if the next character to it is a / to make sure its a closing tag
                badContext = isBadContext(positions[num],
                                          non_executable_contexts)
                if badContext:
                    environments.append('</' + badContext + '>')
                else:
                    environments.append('')
                tags.append('')
                attributes.append('')
                break
            loc += 1
        num += 1
    occurences = {
    }  # a dict to store all the collected information about the reflections
    for i, loc, env, tag, attr, position in zip(range(len(locations)),
                                                locations, environments, tags,
                                                attributes, positions):
        occurences[i] = {}
        occurences[i]['position'] = position
        if loc == 'comment':  # if context is html comment
            env = '-->'  # add --> as environment as we need this to break out
        occurences[i]['context'] = [loc, env, tag, attr]
    return [occurences, positions]