def parse_literal(literal):
    """Report a string literal node under the context matching its quote.

    Only ``literal["value"]`` is inspected.  Values that are neither ``str``
    nor ``unicode`` are ignored.  A value whose first character is ``"`` is
    passed to ``parser.process_content`` with ``contexts.JS_QUOT``; one whose
    first character is ``'`` is passed with ``contexts.JS_APOS``.

    Raises:
        JsParserError: if a string value starts with any other character.
    """
    text = literal["value"]
    if not (isinstance(text, str) or isinstance(text, unicode)):
        return

    opening = text[0]
    if opening == '"':
        quote_context = contexts.JS_QUOT
    elif opening == "'":
        quote_context = contexts.JS_APOS
    else:
        raise JsParserError("Incorrect quoting of literal")
    parser.process_content(text, quote_context)
def parse_js(content):
    """Parse *content* as JavaScript and walk the resulting tree.

    Every entry of the parsed tree's ``"body"`` is handed to ``parse_expr``.
    If ``JsSyntaxError`` is raised while parsing or walking, the raw text is
    reported to ``parser.process_content`` as ``contexts.JS_CODE`` instead.
    The time spent is added to ``library.js_us``.
    """
    from datetime import datetime

    started = datetime.now()
    engine = PyJsParser()
    try:
        syntax_tree = engine.parse(content)
        for node in syntax_tree["body"]:
            parse_expr(node)
    except JsSyntaxError:
        # Not parseable as JavaScript: fall back to reporting the raw text.
        parser.process_content(content, contexts.JS_CODE)
    finished = datetime.now()
    library.js_us += finished - started
def parse_css_declaration_text(text):
    """Parse *text* as a CSS declaration block and process the result.

    *text* is assigned to the ``cssText`` of a fresh ``CSSStyleDeclaration``.
    If that assignment raises, *text* is reported to
    ``parser.process_content`` as ``contexts.CSS_UNKNOWN``.  In either case
    the declaration object is then passed to ``parse_css_declaration`` and
    the time spent is added to ``library.css_us``.
    """
    from datetime import datetime

    began = datetime.now()
    style = CSSStyleDeclaration()
    try:
        style.cssText = text
    except Exception:
        # Parsing failed
        parser.process_content(text, contexts.CSS_UNKNOWN)
    parse_css_declaration(style)
    finished = datetime.now()
    library.css_us += finished - began
def parse_css_stylesheet(content):
    """Parse *content* as a CSS style sheet and process each of its rules.

    *content* is assigned to the ``cssText`` of a fresh ``CSSStyleSheet``.
    If that assignment raises, *content* is reported to
    ``parser.process_content`` as ``contexts.CSS_UNKNOWN``.  In either case
    every rule in ``cssRules`` is then passed to ``parse_css_rule`` and the
    time spent is added to ``library.css_us``.
    """
    from datetime import datetime

    began = datetime.now()
    stylesheet = CSSStyleSheet()
    try:
        stylesheet.cssText = content
    except Exception:
        # Parsing failed
        parser.process_content(content, contexts.CSS_UNKNOWN)
    for css_rule in stylesheet.cssRules:
        parse_css_rule(css_rule)
    finished = datetime.now()
    library.css_us += finished - began
def parse_expr(expr):
    """Recursively walk a parsed JavaScript node and report its strings.

    * A dict whose ``"type"`` is ``"Literal"`` goes to ``parse_literal``.
    * Any other dict has each key and each value walked recursively.
    * Lists, sets and tuples have each item walked recursively.
    * ``str`` / ``unicode`` values are reported to ``parser.process_content``
      as ``contexts.JS_CODE``.

    Values of any other type are ignored.
    """
    if isinstance(expr, dict):
        if expr.get("type") == "Literal":
            parse_literal(expr)
            return
        for name in expr:
            parse_expr(name)
            parse_expr(expr[name])
        return

    if isinstance(expr, (list, set, tuple)):
        for element in expr:
            parse_expr(element)
        return

    if isinstance(expr, str) or isinstance(expr, unicode):
        parser.process_content(expr, contexts.JS_CODE)
def process_attributes(tag):
    """Report every attribute name of *tag* and process its value(s).

    Each attribute name is reported as ``contexts.HTML_ATTR_NAME``.  The
    value is then handled according to its type:

    * a list: every item goes to ``process_attr_value``;
    * ``ContentMetaAttributeValue``: its ``original_value`` goes to
      ``process_meta_url_value`` when the tag's ``http-equiv`` is
      ``refresh`` and the value contains ``url=``, otherwise to
      ``process_attr_value``;
    * ``CharsetMetaAttributeValue``: its ``original_value`` goes to
      ``process_attr_value``;
    * anything else goes to ``process_attr_value`` unchanged.
    """
    for attr_name in tag.attrs:
        parser.process_content(attr_name, contexts.HTML_ATTR_NAME)
        raw = tag.attrs[attr_name]

        if isinstance(raw, list):
            for item in raw:
                process_attr_value(attr_name, item)
            continue

        if isinstance(raw, ContentMetaAttributeValue):
            is_refresh = tag.attrs["http-equiv"].lower() == "refresh"
            if is_refresh and "url=" in raw.lower():
                process_meta_url_value(raw.original_value)
            else:
                process_attr_value(attr_name, raw.original_value)
            continue

        if isinstance(raw, CharsetMetaAttributeValue):
            process_attr_value(attr_name, raw.original_value)
        else:
            process_attr_value(attr_name, raw)
def parse_uri(content):
    """Report the parts of a URI and recurse into any embedded content.

    Three cases are distinguished by the (case-insensitive) scheme prefix:

    * ``javascript:`` -- the text after the prefix is reported as
      ``contexts.URI_JS`` and the percent-decoded URI is handed to
      ``js_parser.parse_js``.
    * ``data:`` -- the header before the first comma is reported as
      ``contexts.URI_CONTENT_TYPE`` and its ``;`` parameters, if any, as
      ``contexts.URI_UNKNOWN_DATA``.  The payload after the comma is decoded
      (base64 when a ``base64`` parameter is present, otherwise
      percent-decoding followed by the declared ``charset``, default
      ``utf8``) and dispatched to the HTML, CSS or JS parser according to
      the content type.  A ``data:`` URI without a comma is reported whole
      as ``contexts.URI_UNKNOWN_DATA``.
    * anything else is reported as ``contexts.URI_URL``.

    The time spent is added to ``library.uri_us`` on every exit path.

    Args:
        content: the URI text.
    """
    from datetime import datetime

    start = datetime.now()
    try:
        lowered = content.lower()

        # javascript: scheme
        if lowered.startswith("javascript:"):
            parser.process_content(content[len("javascript:"):],
                                   contexts.URI_JS)
            # NOTE(review): the scheme prefix is still part of the text
            # handed to the JS parser -- confirm this is intended.
            content = urllib.unquote(content).decode("utf8")
            js_parser.parse_js(content)

        # data: scheme
        elif lowered.startswith("data:"):
            content = content[len("data:"):]
            comma = content.find(",")
            if comma == -1:
                # invalid format of data: scheme
                parser.process_content(content, contexts.URI_UNKNOWN_DATA)
                return

            header = content[:comma]
            parser.process_content(header, contexts.URI_CONTENT_TYPE)

            # Defaults: percent-encoded payload in utf8.
            enctype = "urlencode"
            charset = "utf8"
            content_type, separator, parameters = header.partition(";")
            if separator:
                # placeholder may sit in the parameters
                parser.process_content(parameters, contexts.URI_UNKNOWN_DATA)
                # A data: header may carry several ';'-separated parameters,
                # e.g. "text/html;charset=utf-8;base64".
                for parameter in parameters.split(";"):
                    lowered_parameter = parameter.lower()
                    if lowered_parameter == "base64":
                        enctype = "base64"
                    elif lowered_parameter.startswith("charset="):
                        # Keep the default when the declared charset is empty.
                        charset = parameter[len("charset="):] or "utf8"

            # decode content: everything after the first comma
            content = content[comma + 1:]
            if enctype == "base64":
                content = b64decode(content)
            else:
                content = urllib.unquote(content).decode(charset)

            # subprocess content according to the content type
            media_type = content_type.lower()
            if media_type == "text/html":
                parser.process_content(content, contexts.URI_HTML_DATA)
                html_parser.parse_html(content)
            elif media_type == "text/css":
                parser.process_content(content, contexts.URI_CSS_DATA)
                css_parser.parse_css_stylesheet(content)
            elif media_type in ("text/javascript",
                                "application/x-javascript",
                                "application/javascript"):
                parser.process_content(content, contexts.URI_JS_DATA)
                js_parser.parse_js(content)
            else:
                parser.process_content(content, contexts.URI_OTHER_DATA)

        # other schemes
        else:
            parser.process_content(content, contexts.URI_URL)
    finally:
        # Account for the elapsed time even on the early return above.
        end = datetime.now()
        library.uri_us += end - start
def _process_quoted_css_uri(uri, error_message):
    """Report *uri* under the CSS URI context for its quoting, then parse it.

    Raises ``CssParserError(error_message)`` when ``uri.quoting`` is not one
    of ``"DOUBLE"``, ``"SINGLE"`` or ``"NONE"``.
    """
    quoting = uri.quoting
    if quoting == "DOUBLE":
        parser.process_content(uri, contexts.CSS_QUOT_URI)
    elif quoting == "SINGLE":
        parser.process_content(uri, contexts.CSS_APOS_URI)
    elif quoting == "NONE":
        parser.process_content(uri, contexts.CSS_UNQUOT_URI)
    else:
        raise CssParserError(error_message)
    uri_parser.parse_uri(uri)


def parse_css_rule(rule):
    """Report the content of one CSS rule, dispatching on ``rule.type``.

    Selectors, declarations, media lists, charsets, URIs, comments and
    variables are forwarded to the matching helper or reported through
    ``parser.process_content``.  Rules nested in a media rule are processed
    recursively.

    Raises:
        CssParserError: for an unrecognised rule type or a URI whose
            ``quoting`` is not ``"DOUBLE"``, ``"SINGLE"`` or ``"NONE"``.
    """
    kind = rule.type

    if kind == CSSRule.UNKNOWN_RULE:
        parser.process_content(rule, contexts.CSS_UNKNOWN)
        return

    if kind == CSSRule.STYLE_RULE:
        for selector in rule.selectorList:
            parse_selector(selector)
        parse_css_declaration(rule.style)
        return

    if kind == CSSRule.PAGE_RULE:
        parse_css_declaration(rule.style)
        page_selectors = SelectorList(selectorText=rule.selectorText)
        for selector in page_selectors:
            parse_selector(selector)
        return

    if kind == CSSRule.CHARSET_RULE:
        parser.process_content(rule.encoding, contexts.CSS_CHARSET)
        return

    if kind == CSSRule.IMPORT_RULE:
        if rule.media is not None:
            parse_media_list(rule.media)
        _process_quoted_css_uri(rule.href,
                                "Incorrectly quoted CSS import URI")
        return

    if kind == CSSRule.MEDIA_RULE:
        parse_media_list(rule.media)
        for nested in rule.cssRules:
            parse_css_rule(nested)
        return

    if kind == CSSRule.FONT_FACE_RULE:
        parse_css_declaration(rule.style)
        return

    if kind == CSSRule.NAMESPACE_RULE:
        if rule.prefix is not None:
            parser.process_content(rule.prefix, contexts.CSS_QUOT_STRING)
        if rule.namespaceURI is not None:
            _process_quoted_css_uri(rule.namespaceURI,
                                    "Incorrect quoting CSS namespace URI")
        return

    if kind == CSSRule.COMMENT:
        parser.process_content(unicode(rule), contexts.CSS_COMMENT)
        return

    if kind == CSSRule.VARIABLES_RULE:
        for variable in rule.variables:
            # NOTE(review): the whole rule, not the variable name, is
            # reported on each iteration -- confirm this is intended.
            parser.process_content(rule, contexts.CSS_ENTITY)
            parse_css_declaration(rule.variables[variable])
        if rule.media is not None:
            parse_media_list(rule.media)
        return

    if kind == CSSRule.MARGIN_RULE:
        parse_css_declaration(rule.style)
        return

    raise CssParserError("Wrong CSS Rule")
def parse_property_value(value):
    """Report one CSS property value according to ``value.type``.

    Identifiers, strings, URIs, dimensions, hashes, functions, calc
    expressions and variables are reported through
    ``parser.process_content``; URIs are additionally handed to
    ``uri_parser.parse_uri``.  Unicode ranges, numbers, percentages and
    colour values are skipped.

    Raises:
        CssParserError: for an unrecognised value type, or a string / URI
            whose ``quoting`` is not one of the accepted kinds.
    """
    kind = value.type

    if kind == Value.IDENT:
        parser.process_content(value.value, contexts.CSS_PROPERTY_VALUE)
        return

    if kind == Value.STRING:
        quoting = value.value.quoting
        if quoting == "DOUBLE":
            parser.process_content(value.value, contexts.CSS_QUOT_STRING)
        elif quoting == "SINGLE":
            parser.process_content(value.value, contexts.CSS_APOS_STRING)
        else:
            raise CssParserError("Incorrect quoting of a CSS STRING")
        return

    if kind == Value.UNICODE_RANGE:
        # Data type is tuple of two integers
        return

    if kind == Value.URI:
        quoting = value.value.quoting
        if quoting == "DOUBLE":
            parser.process_content(value.value, contexts.CSS_QUOT_URI)
        elif quoting == "SINGLE":
            parser.process_content(value.value, contexts.CSS_APOS_URI)
        elif quoting == "NONE":
            parser.process_content(value.value, contexts.CSS_UNQUOT_URI)
        else:
            raise CssParserError("Incorrect quoting of a CSS URI")
        uri_parser.parse_uri(value.value)
        return

    if kind == Value.DIMENSION:
        parser.process_content(value.dimension, contexts.CSS_ENTITY)
        return

    # Numbers and percentages are integers, colour values are tuples of
    # three integers.
    if (kind == Value.NUMBER or kind == Value.PERCENTAGE
            or kind == Value.COLOR_VALUE):
        return

    if (kind == Value.HASH or kind == Value.FUNCTION
            or kind == Value.CALC or kind == Value.VARIABLE):
        parser.process_content(value.value, contexts.CSS_ENTITY)
        return

    raise CssParserError("Wrong CSS Value")
def parse_property(prop):
    """Report a CSS property's name and priority, then each of its values.

    The name is reported as ``contexts.CSS_PROPERTY_NAME`` and the priority
    as ``contexts.CSS_ENTITY``.  Every item of ``prop.propertyValue`` is
    passed to ``parse_property_value``.
    """
    report = parser.process_content
    report(prop.name, contexts.CSS_PROPERTY_NAME)
    report(prop.priority, contexts.CSS_ENTITY)
    for item in prop.propertyValue:
        parse_property_value(item)
def parse_selector(selector):
    """Report the selector's ``selectorText`` as ``contexts.CSS_ENTITY``."""
    selector_text = selector.selectorText
    parser.process_content(selector_text, contexts.CSS_ENTITY)
def parse_media_list(media_list):
    """Report the ``value`` of every entry as ``contexts.CSS_MEDIA_ITEM``."""
    for entry in media_list:
        media_value = entry.value
        parser.process_content(media_value, contexts.CSS_MEDIA_ITEM)
def process_meta_url_value(value):
    """Report both halves of a meta-refresh ``content`` value and parse its URL.

    The value is split around its first case-insensitive ``url=``.  The part
    before it is reported as a plain attribute value and the part after it
    as a URI, both under the context matching ``value.quoting``.  The URL
    part, with surrounding whitespace and quote characters removed, is then
    handed to ``uri_parser.parse_uri``.

    Args:
        value: attribute value exposing a ``quoting`` attribute.  It is
            assumed to contain ``url=``; ``process_attributes`` only calls
            this function after finding that marker.

    Raises:
        HtmlParserError: if ``value.quoting`` is not ``"DOUBLE"``,
            ``"SINGLE"`` or ``"NONE"``.
    """
    # Locate the marker once; 4 == len("url=").
    marker = value.lower().find("url=")
    front = value[0:marker]
    back = value[marker + 4:]

    # The constants live in ``contexts``, as used by process_attr_value.
    quoting = value.quoting
    if quoting == "DOUBLE":
        parser.process_content(front, contexts.HTML_QUOT_ATTR)
        parser.process_content(back, contexts.HTML_QUOT_URI)
    elif quoting == "SINGLE":
        parser.process_content(front, contexts.HTML_APOS_ATTR)
        parser.process_content(back, contexts.HTML_APOS_URI)
    elif quoting == "NONE":
        parser.process_content(front, contexts.HTML_UNQUOT_ATTR)
        parser.process_content(back, contexts.HTML_UNQUOT_URI)
    else:
        raise HtmlParserError("Wrong quoting of meta refresh content")

    # str.strip takes a string of characters to remove, not a list.
    back = back.strip()
    back = back.strip("'\"")
    uri_parser.parse_uri(back)
def _report_quoted_attr(value, double_ctx, single_ctx, bare_ctx, complaint):
    """Report *value* under the context that matches ``value.quoting``.

    Raises ``HtmlParserError(complaint)`` when the quoting is not
    ``"DOUBLE"``, ``"SINGLE"`` or ``"NONE"``.
    """
    quoting = value.quoting
    if quoting == "DOUBLE":
        parser.process_content(value, double_ctx)
    elif quoting == "SINGLE":
        parser.process_content(value, single_ctx)
    elif quoting == "NONE":
        parser.process_content(value, bare_ctx)
    else:
        raise HtmlParserError(complaint)


def process_attr_value(name, value):
    """Report an HTML attribute value and recurse into embedded content.

    The attribute *name* (case-insensitive) selects the treatment:

    * a name in ``URL_ATTRS``: reported as a URI, then parsed by
      ``uri_parser.parse_uri``;
    * a name starting with ``on``: reported as JS, then parsed by
      ``js_parser.parse_js``;
    * ``style``: reported as CSS, then parsed by
      ``css_parser.parse_css_declaration_text``;
    * anything else: reported as a generic attribute value.

    Raises:
        HtmlParserError: if ``value.quoting`` is not ``"DOUBLE"``,
            ``"SINGLE"`` or ``"NONE"``.
    """
    lowered = name.lower()

    if lowered in URL_ATTRS:
        _report_quoted_attr(value,
                            contexts.HTML_QUOT_URI,
                            contexts.HTML_APOS_URI,
                            contexts.HTML_UNQUOT_URI,
                            "Wrong quoting of URI")
        uri_parser.parse_uri(value)
        return

    if lowered[0:2] == "on":
        _report_quoted_attr(value,
                            contexts.HTML_QUOT_JS,
                            contexts.HTML_APOS_JS,
                            contexts.HTML_UNQUOT_JS,
                            "Wrong quoting of attr with JS")
        js_parser.parse_js(value)
        return

    if lowered == "style":
        _report_quoted_attr(value,
                            contexts.HTML_QUOT_CSS,
                            contexts.HTML_APOS_CSS,
                            contexts.HTML_UNQUOT_CSS,
                            "Wrong quoting of attr with CSS")
        css_parser.parse_css_declaration_text(value)
        return

    _report_quoted_attr(value,
                        contexts.HTML_QUOT_ATTR,
                        contexts.HTML_APOS_ATTR,
                        contexts.HTML_UNQUOT_ATTR,
                        "Wrong quoting of generic HTML attr")
def process_element(element):
    """Report one node of the HTML tree and recurse where it has content.

    * ``Comment`` and ``Doctype`` nodes are reported as
      ``contexts.HTML_COMMENT`` / ``contexts.HTML_DOCTYPE``.
    * Text inside ``<script>`` is reported as ``contexts.HTML_JS_DATA`` and
      parsed as JavaScript when the tag has no ``type`` or its ``type``
      mentions ``javascript``; otherwise it is reported as
      ``contexts.HTML_UNKNOWN_SCRIPT``.
    * Text inside ``<style>`` is reported as ``contexts.HTML_CSS_DATA`` and
      parsed as a style sheet.
    * Any other text is reported as ``contexts.HTML_TEXT``.
    * ``Tag`` nodes have their name reported, then their attributes and
      children processed.

    Raises:
        HtmlParserError: if *element* is none of the types above.
    """
    # Comment and Doctype are tested before the NavigableString check.
    if isinstance(element, Comment):
        parser.process_content(unicode(element), contexts.HTML_COMMENT)
        return

    if isinstance(element, Doctype):
        parser.process_content(unicode(element), contexts.HTML_DOCTYPE)
        return

    if isinstance(element, NavigableString):
        container = element.parent
        container_name = container.name.lower()
        text = unicode(element)

        if container_name == "script":
            attrs = container.attrs
            if "type" not in attrs or "javascript" in attrs["type"].lower():
                parser.process_content(text, contexts.HTML_JS_DATA)
                js_parser.parse_js(text)
            else:
                parser.process_content(text, contexts.HTML_UNKNOWN_SCRIPT)
        elif container_name == "style":
            parser.process_content(text, contexts.HTML_CSS_DATA)
            css_parser.parse_css_stylesheet(text)
        else:
            parser.process_content(text, contexts.HTML_TEXT)
        return

    if isinstance(element, Tag):
        parser.process_content(element.name, contexts.HTML_TAGNAME)
        process_attributes(element)
        process_children(element)
        return

    raise HtmlParserError("Unknown HTML element")