Пример #1
0
def parse_literal(literal):
    """Report a JavaScript string literal under its quoting context.

    ``literal`` is a mapping whose ``"value"`` entry holds the literal.
    Values that are neither ``str`` nor ``unicode`` are ignored.  A string
    starting with a double quote is passed to ``parser.process_content``
    with ``contexts.JS_QUOT``; one starting with an apostrophe is passed
    with ``contexts.JS_APOS``.

    Raises:
        JsParserError: if the string does not start with a quote character.
            This includes the empty string, which used to escape as an
            ``IndexError`` from the ``[0]`` lookup.
    """
    value = literal["value"]
    if not isinstance(value, (str, unicode)):
        return
    # startswith() is safe on an empty string, unlike indexing with [0].
    if value.startswith('"'):
        parser.process_content(value, contexts.JS_QUOT)
    elif value.startswith("'"):
        parser.process_content(value, contexts.JS_APOS)
    else:
        raise JsParserError("Incorrect quoting of literal")
Пример #2
0
def parse_js(content):
    """Parse JavaScript text and walk each top-level node of the tree.

    A new ``PyJsParser`` parses ``content``; every entry of the resulting
    tree's ``"body"`` is handed to ``parse_expr``.  If a ``JsSyntaxError``
    is raised, the whole text is instead reported through
    ``parser.process_content`` with ``contexts.JS_CODE``.  The elapsed time
    is then added to ``library.js_us``.
    """
    from datetime import datetime
    began = datetime.now()
    engine = PyJsParser()
    try:
        syntax_tree = engine.parse(content)
        for node in syntax_tree["body"]:
            parse_expr(node)
    except JsSyntaxError:
        # Unparseable input is reported as one opaque piece of JS code.
        parser.process_content(content, contexts.JS_CODE)
    finished = datetime.now()
    elapsed = finished - began
    library.js_us += elapsed
Пример #3
0
def parse_css_declaration_text(text):
    """Parse the text of a CSS declaration block and walk the result.

    ``text`` is assigned to the ``cssText`` of a new ``CSSStyleDeclaration``.
    If that assignment raises, the text is reported through
    ``parser.process_content`` with ``contexts.CSS_UNKNOWN``.  In either case
    the declaration object is then passed to ``parse_css_declaration`` and
    the elapsed time is added to ``library.css_us``.
    """
    from datetime import datetime
    began = datetime.now()
    style = CSSStyleDeclaration()
    try:
        style.cssText = text
    except Exception:
        # The text could not be parsed as a declaration block.
        parser.process_content(text, contexts.CSS_UNKNOWN)
    parse_css_declaration(style)
    finished = datetime.now()
    elapsed = finished - began
    library.css_us += elapsed
Пример #4
0
def parse_css_stylesheet(content):
    """Parse a whole CSS style sheet and walk each of its rules.

    ``content`` is assigned to the ``cssText`` of a new ``CSSStyleSheet``.
    If that assignment raises, the text is reported through
    ``parser.process_content`` with ``contexts.CSS_UNKNOWN``.  Every rule in
    ``cssRules`` of the sheet is then passed to ``parse_css_rule`` and the
    elapsed time is added to ``library.css_us``.
    """
    from datetime import datetime
    began = datetime.now()
    stylesheet = CSSStyleSheet()
    try:
        stylesheet.cssText = content
    except Exception:
        # The text could not be parsed as a style sheet.
        parser.process_content(content, contexts.CSS_UNKNOWN)
    for css_rule in stylesheet.cssRules:
        parse_css_rule(css_rule)
    finished = datetime.now()
    elapsed = finished - began
    library.css_us += elapsed
Пример #5
0
def parse_expr(expr):
    """Recursively walk a node of a parsed JavaScript tree.

    * A dict whose ``"type"`` is ``"Literal"`` goes to ``parse_literal``.
    * Any other dict has each key and each value walked in turn.
    * A list, set or tuple has each item walked.
    * A ``str``/``unicode`` is reported through ``parser.process_content``
      with ``contexts.JS_CODE``.

    Anything else is ignored.
    """
    if isinstance(expr, dict):
        if expr.get("type") == "Literal":
            parse_literal(expr)
            return
        for name in expr:
            parse_expr(name)
            parse_expr(expr[name])
        return

    if isinstance(expr, (list, set, tuple)):
        for child in expr:
            parse_expr(child)
        return

    if isinstance(expr, (str, unicode)):
        parser.process_content(expr, contexts.JS_CODE)
Пример #6
0
def process_attributes(tag):
    """Report every attribute name of ``tag`` and dispatch its value(s).

    Each attribute name is reported with ``contexts.HTML_ATTR_NAME``.  The
    value is then handled by its type:

    * list -- every item goes to ``process_attr_value``;
    * ``ContentMetaAttributeValue`` -- when the tag's ``http-equiv`` is
      ``refresh`` and the value contains ``url=`` the original value goes to
      ``process_meta_url_value``, otherwise to ``process_attr_value``;
    * ``CharsetMetaAttributeValue`` -- the original value goes to
      ``process_attr_value``;
    * anything else -- the value itself goes to ``process_attr_value``.
    """
    for name in tag.attrs:
        parser.process_content(name, contexts.HTML_ATTR_NAME)
        raw = tag.attrs[name]

        if isinstance(raw, list):
            for item in raw:
                process_attr_value(name, item)
            continue

        if isinstance(raw, ContentMetaAttributeValue):
            is_refresh = tag.attrs["http-equiv"].lower() == "refresh"
            if is_refresh and "url=" in raw.lower():
                process_meta_url_value(raw.original_value)
            else:
                process_attr_value(name, raw.original_value)
            continue

        if isinstance(raw, CharsetMetaAttributeValue):
            process_attr_value(name, raw.original_value)
        else:
            process_attr_value(name, raw)
Пример #7
0
def parse_uri(content):
    """Classify a URI by its scheme and report its parts to the parser.

    * ``javascript:`` -- the text after the scheme is reported with
      ``contexts.URI_JS``; the URL-unquoted, UTF-8 decoded text is then
      passed to ``js_parser.parse_js``.
    * ``data:`` -- the text after the scheme is handled by
      ``_parse_data_uri``.
    * anything else -- the whole text is reported with ``contexts.URI_URL``.

    The elapsed time is always added to ``library.uri_us``, including when a
    malformed ``data:`` URI ends processing early or a sub-parser raises.
    """
    from datetime import datetime
    start = datetime.now()
    try:
        lowered = content.lower()
        if lowered.startswith('javascript:'):
            parser.process_content(content[len('javascript:'):],
                                   contexts.URI_JS)
            # NOTE(review): the text handed to the JS parser still contains
            # the "javascript:" prefix, as in the original code -- confirm
            # whether that is intended.
            content = urllib.unquote(content).decode("utf8")
            js_parser.parse_js(content)
        elif lowered.startswith('data:'):
            _parse_data_uri(content[len('data:'):])
        else:
            parser.process_content(content, contexts.URI_URL)
    finally:
        library.uri_us += datetime.now() - start


def _parse_data_uri(content):
    """Report and sub-parse the part of a ``data:`` URI after the scheme.

    ``content`` has the form ``<content-type>[;<params>],<payload>``.  The
    header (everything before the first comma) is reported with
    ``contexts.URI_CONTENT_TYPE`` and the parameter segment, if any, with
    ``contexts.URI_UNKNOWN_DATA``.  The payload is base64 decoded when a
    ``base64`` parameter is present; otherwise it is URL-unquoted and
    decoded with the ``charset=`` parameter (``utf8`` when absent).  The
    decoded payload is reported and sub-parsed according to the content type.
    """
    comma = content.find(",")
    if comma == -1:
        # Invalid format of the data: scheme -- there is no payload separator.
        parser.process_content(content, contexts.URI_UNKNOWN_DATA)
        return

    header = content[:comma]
    parser.process_content(header, contexts.URI_CONTENT_TYPE)

    # Defaults: URL-encoding for the payload and utf8 for its charset.
    enctype = "urlencode"
    charset = "utf8"
    semicolon = header.find(";")
    if semicolon != -1:
        content_type = header[:semicolon]
        encoding = header[semicolon + 1:]
        parser.process_content(encoding, contexts.URI_UNKNOWN_DATA)
        # Several ";"-separated parameters may precede the comma,
        # e.g. "charset=utf-8;base64".
        for param in encoding.split(";"):
            param = param.strip()
            lowered = param.lower()
            if lowered == "base64":
                enctype = "base64"
            elif lowered.startswith("charset="):
                charset = param[len("charset="):] or "utf8"
    else:
        content_type = header

    # The payload is everything after the first comma.
    payload = content[comma + 1:]
    if enctype == "base64":
        payload = b64decode(payload)
    else:
        payload = urllib.unquote(payload).decode(charset)

    # Sub-process the payload according to the content type.
    kind = content_type.lower()
    if kind == "text/html":
        parser.process_content(payload, contexts.URI_HTML_DATA)
        html_parser.parse_html(payload)
    elif kind == "text/css":
        parser.process_content(payload, contexts.URI_CSS_DATA)
        css_parser.parse_css_stylesheet(payload)
    elif kind in ("text/javascript", "application/x-javascript",
                  "application/javascript"):
        parser.process_content(payload, contexts.URI_JS_DATA)
        js_parser.parse_js(payload)
    else:
        parser.process_content(payload, contexts.URI_OTHER_DATA)
Пример #8
0
def _process_css_uri(uri, error_message):
    """Report a CSS URI under the context matching its quoting, then parse it.

    Raises ``CssParserError(error_message)`` when ``uri.quoting`` is not one
    of ``"DOUBLE"``, ``"SINGLE"`` or ``"NONE"``.
    """
    if uri.quoting == "DOUBLE":
        parser.process_content(uri, contexts.CSS_QUOT_URI)
    elif uri.quoting == "SINGLE":
        parser.process_content(uri, contexts.CSS_APOS_URI)
    elif uri.quoting == "NONE":
        parser.process_content(uri, contexts.CSS_UNQUOT_URI)
    else:
        raise CssParserError(error_message)
    uri_parser.parse_uri(uri)


def parse_css_rule(rule):
    """Report the contents of one CSS rule, dispatching on ``rule.type``.

    Selectors, declarations, media lists and URIs found in the rule are
    passed on to the matching helper (``parse_selector``,
    ``parse_css_declaration``, ``parse_media_list``, ``uri_parser.parse_uri``);
    nested rules of a media rule are walked recursively.

    Raises:
        CssParserError: for an unrecognised rule type, or for an import or
            namespace URI whose quoting is not DOUBLE, SINGLE or NONE.
    """
    if rule.type == CSSRule.UNKNOWN_RULE:
        # NOTE(review): the rule object itself (not its text) is reported
        # here, unlike the COMMENT branch -- confirm process_content copes.
        parser.process_content(rule, contexts.CSS_UNKNOWN)
    elif rule.type == CSSRule.STYLE_RULE:
        for selector in rule.selectorList:
            parse_selector(selector)
        parse_css_declaration(rule.style)
    elif rule.type == CSSRule.PAGE_RULE:
        parse_css_declaration(rule.style)
        selectorList = SelectorList(selectorText=rule.selectorText)
        for selector in selectorList:
            parse_selector(selector)
    elif rule.type == CSSRule.CHARSET_RULE:
        parser.process_content(rule.encoding, contexts.CSS_CHARSET)
    elif rule.type == CSSRule.IMPORT_RULE:
        if rule.media is not None:
            parse_media_list(rule.media)
        _process_css_uri(rule.href, "Incorrectly quoted CSS import URI")
    elif rule.type == CSSRule.MEDIA_RULE:
        parse_media_list(rule.media)
        for inner in rule.cssRules:
            parse_css_rule(inner)
    elif rule.type == CSSRule.FONT_FACE_RULE:
        parse_css_declaration(rule.style)
    elif rule.type == CSSRule.NAMESPACE_RULE:
        if rule.prefix is not None:
            parser.process_content(rule.prefix, contexts.CSS_QUOT_STRING)
        if rule.namespaceURI is not None:
            _process_css_uri(rule.namespaceURI,
                             "Incorrect quoting CSS namespace URI")
    elif rule.type == CSSRule.COMMENT:
        parser.process_content(unicode(rule), contexts.CSS_COMMENT)
    elif rule.type == CSSRule.VARIABLES_RULE:
        for var in rule.variables:
            # Report the variable itself; the original passed the enclosing
            # rule on every iteration and never used the loop variable here.
            parser.process_content(var, contexts.CSS_ENTITY)
            parse_css_declaration(rule.variables[var])
        if rule.media is not None:
            parse_media_list(rule.media)
    elif rule.type == CSSRule.MARGIN_RULE:
        parse_css_declaration(rule.style)
    else:
        raise CssParserError("Wrong CSS Rule")
Пример #9
0
def parse_property_value(value):
    """Report one CSS property value according to its ``Value`` type.

    Identifiers, strings, URIs, dimensions, hashes, functions, calc
    expressions and variables are passed to ``parser.process_content`` under
    the matching context; URIs are additionally handed to
    ``uri_parser.parse_uri``.  Unicode ranges, numbers, percentages and
    colour values are skipped.

    Raises:
        CssParserError: for an unrecognised value type, or for a string or
            URI whose quoting is not one of the accepted kinds.
    """
    kind = value.type

    if kind == Value.IDENT:
        parser.process_content(value.value, contexts.CSS_PROPERTY_VALUE)
        return

    if kind == Value.STRING:
        text = value.value
        if text.quoting == "DOUBLE":
            parser.process_content(text, contexts.CSS_QUOT_STRING)
        elif text.quoting == "SINGLE":
            parser.process_content(text, contexts.CSS_APOS_STRING)
        else:
            raise CssParserError("Incorrect quoting of a CSS STRING")
        return

    if kind == Value.UNICODE_RANGE:
        # Data type is a tuple of two integers; nothing to report.
        return

    if kind == Value.URI:
        uri = value.value
        if uri.quoting == "DOUBLE":
            parser.process_content(uri, contexts.CSS_QUOT_URI)
        elif uri.quoting == "SINGLE":
            parser.process_content(uri, contexts.CSS_APOS_URI)
        elif uri.quoting == "NONE":
            parser.process_content(uri, contexts.CSS_UNQUOT_URI)
        else:
            raise CssParserError("Incorrect quoting of a CSS URI")
        uri_parser.parse_uri(uri)
        return

    if kind == Value.DIMENSION:
        parser.process_content(value.dimension, contexts.CSS_ENTITY)
        return

    # Numbers and percentages are integers, colour values a tuple of three
    # integers; none of them is reported.
    if (kind == Value.NUMBER or kind == Value.PERCENTAGE
            or kind == Value.COLOR_VALUE):
        return

    if (kind == Value.HASH or kind == Value.FUNCTION
            or kind == Value.CALC or kind == Value.VARIABLE):
        parser.process_content(value.value, contexts.CSS_ENTITY)
        return

    raise CssParserError("Wrong CSS Value")
Пример #10
0
def parse_property(prop):
    """Report a CSS property's name and priority, then each of its values.

    The name is reported with ``contexts.CSS_PROPERTY_NAME`` and the priority
    with ``contexts.CSS_ENTITY``; every item of ``prop.propertyValue`` is
    passed to ``parse_property_value``.
    """
    report = parser.process_content
    report(prop.name, contexts.CSS_PROPERTY_NAME)
    report(prop.priority, contexts.CSS_ENTITY)
    for component in prop.propertyValue:
        parse_property_value(component)
Пример #11
0
def parse_selector(selector):
    """Report the text of a CSS selector with ``contexts.CSS_ENTITY``."""
    selector_text = selector.selectorText
    parser.process_content(selector_text, contexts.CSS_ENTITY)
Пример #12
0
def parse_media_list(media_list):
    """Report the ``value`` of every item in ``media_list``.

    Each value is passed to ``parser.process_content`` with
    ``contexts.CSS_MEDIA_ITEM``.
    """
    for entry in media_list:
        medium_text = entry.value
        parser.process_content(medium_text, contexts.CSS_MEDIA_ITEM)
Пример #13
0
def process_meta_url_value(value):
    """Report the two halves of a meta-refresh ``content`` value.

    The value is split around its first ``url=`` (matched case-insensitively):
    the text before it is reported as a generic attribute and the text after
    it as a URI, both under the contexts matching ``value.quoting``.  The URI
    part, stripped of surrounding whitespace and quote characters, is then
    passed to ``uri_parser.parse_uri``.

    Raises:
        HtmlParserError: if ``value.quoting`` is not DOUBLE, SINGLE or NONE.
    """
    # NOTE(review): assumes the value contains "url="; the visible caller
    # checks a lowercased copy of the attribute before calling -- confirm the
    # original value always matches.
    marker = value.lower().find("url=")
    front = value[0:marker]
    back = value[marker + 4:]
    # The module name is "contexts", as in every other function here; the
    # original referenced "context".
    if value.quoting == "DOUBLE":
        parser.process_content(front, contexts.HTML_QUOT_ATTR)
        parser.process_content(back, contexts.HTML_QUOT_URI)
    elif value.quoting == "SINGLE":
        parser.process_content(front, contexts.HTML_APOS_ATTR)
        parser.process_content(back, contexts.HTML_APOS_URI)
    elif value.quoting == "NONE":
        parser.process_content(front, contexts.HTML_UNQUOT_ATTR)
        parser.process_content(back, contexts.HTML_UNQUOT_URI)
    else:
        raise HtmlParserError("Wrong quoting of meta refresh content")
    back = back.strip()
    # strip() takes a string of characters; passing a list raises TypeError.
    back = back.strip("'\"")
    uri_parser.parse_uri(back)
Пример #14
0
def process_attr_value(name, value):
    """Report an HTML attribute value and sub-parse it where applicable.

    The attribute name (case-insensitively) selects the category:

    * a name in ``URL_ATTRS`` -- URI contexts, then ``uri_parser.parse_uri``;
    * a name starting with ``on`` -- JS contexts, then ``js_parser.parse_js``;
    * ``style`` -- CSS contexts, then
      ``css_parser.parse_css_declaration_text``;
    * anything else -- generic attribute contexts, no sub-parsing.

    Within a category the context is picked by ``value.quoting``.

    Raises:
        HtmlParserError: if ``value.quoting`` is not DOUBLE, SINGLE or NONE.
    """
    lowered = name.lower()

    if lowered in URL_ATTRS:
        quoting = value.quoting
        if quoting == "DOUBLE":
            target = contexts.HTML_QUOT_URI
        elif quoting == "SINGLE":
            target = contexts.HTML_APOS_URI
        elif quoting == "NONE":
            target = contexts.HTML_UNQUOT_URI
        else:
            raise HtmlParserError("Wrong quoting of URI")
        parser.process_content(value, target)
        uri_parser.parse_uri(value)
        return

    if lowered.startswith("on"):
        quoting = value.quoting
        if quoting == "DOUBLE":
            target = contexts.HTML_QUOT_JS
        elif quoting == "SINGLE":
            target = contexts.HTML_APOS_JS
        elif quoting == "NONE":
            target = contexts.HTML_UNQUOT_JS
        else:
            raise HtmlParserError("Wrong quoting of attr with JS")
        parser.process_content(value, target)
        js_parser.parse_js(value)
        return

    if lowered == "style":
        quoting = value.quoting
        if quoting == "DOUBLE":
            target = contexts.HTML_QUOT_CSS
        elif quoting == "SINGLE":
            target = contexts.HTML_APOS_CSS
        elif quoting == "NONE":
            target = contexts.HTML_UNQUOT_CSS
        else:
            raise HtmlParserError("Wrong quoting of attr with CSS")
        parser.process_content(value, target)
        css_parser.parse_css_declaration_text(value)
        return

    quoting = value.quoting
    if quoting == "DOUBLE":
        target = contexts.HTML_QUOT_ATTR
    elif quoting == "SINGLE":
        target = contexts.HTML_APOS_ATTR
    elif quoting == "NONE":
        target = contexts.HTML_UNQUOT_ATTR
    else:
        raise HtmlParserError("Wrong quoting of generic HTML attr")
    parser.process_content(value, target)
Пример #15
0
def process_element(element):
    """Report one node of a parsed HTML tree according to its type.

    * ``Comment`` / ``Doctype`` -- reported as text under the matching context.
    * ``NavigableString`` -- text inside ``<script>`` is reported and parsed
      as JavaScript when the parent has no ``type`` attribute or its ``type``
      contains ``javascript`` (otherwise reported as an unknown script); text
      inside ``<style>`` is reported and parsed as a style sheet; any other
      text is reported as plain HTML text.
    * ``Tag`` -- the tag name is reported, then its attributes and children
      are processed.

    Raises:
        HtmlParserError: if the node is none of the types above.
    """
    if isinstance(element, Comment):
        parser.process_content(unicode(element), contexts.HTML_COMMENT)
        return

    if isinstance(element, Doctype):
        parser.process_content(unicode(element), contexts.HTML_DOCTYPE)
        return

    if isinstance(element, NavigableString):
        owner = element.parent
        owner_name = owner.name.lower()
        text = unicode(element)
        if owner_name == "script":
            attrs = owner.attrs
            if "type" not in attrs or "javascript" in attrs["type"].lower():
                parser.process_content(text, contexts.HTML_JS_DATA)
                js_parser.parse_js(text)
            else:
                parser.process_content(text, contexts.HTML_UNKNOWN_SCRIPT)
        elif owner_name == "style":
            parser.process_content(text, contexts.HTML_CSS_DATA)
            css_parser.parse_css_stylesheet(text)
        else:
            parser.process_content(text, contexts.HTML_TEXT)
        return

    if isinstance(element, Tag):
        parser.process_content(element.name, contexts.HTML_TAGNAME)
        process_attributes(element)
        process_children(element)
        return

    raise HtmlParserError("Unknown HTML element")