def anonym_dom(dom, selectors, mode):
    '''
    Return an anonymized copy of the DOM text `dom`.

    Block comments are removed with del_blockcomment() first.  The text is
    then walked tag by tag (as matched by re_tag):

    * closing tags are copied verbatim;
    * every other tag is rewritten by anonym_tag(tag, selectors, mode);
    * the content that follows a tag, up to the offset returned by
      skip_content(), is rewritten by anonym_str(content, mode) unless it
      is matched by re_empty or the tag starts with '<script';
    * text found between the end of that content and the next tag is
      copied unchanged.

    `selectors` and `mode` are only forwarded to the helpers above.
    '''
    dom = del_blockcomment(dom)

    pieces = []  # output fragments, concatenated once at the end
    cursor = 0
    match = re_tag.search(dom, cursor)
    while match:
        tag = match.group(0)
        tag_begin = match.start()

        # Whatever sits between the cursor and this tag is kept as is.
        if tag_begin > 0:
            pieces.append(dom[cursor:tag_begin])

        # Closing tags carry nothing to scrub; any other tag goes
        # through anonym_tag.
        is_closing = tag[1] == '/'
        pieces.append(tag if is_closing else anonym_tag(tag, selectors, mode))

        # Non-tag content that directly follows the tag.
        cursor = match.end()
        content_end = skip_content(dom, cursor)
        content = dom[cursor:content_end]

        # NOTE(review): the 'script' comparison is case-sensitive, so the
        # content after a tag spelled <SCRIPT> would be handed to
        # anonym_str -- confirm whether upper-case tags can reach here.
        keep_raw = (content_end <= cursor
                    or re_empty.match(content)
                    or tag[1:7] == 'script')
        pieces.append(content if keep_raw else anonym_str(content, mode))

        cursor = content_end
        match = re_tag.search(dom, cursor)

    return ''.join(pieces)
def prettify(dom):
    '''
    Add indentation to a DOM file so that it is easier to read.

    Block comments are removed with del_blockcomment() first.  The text is
    then walked tag by tag (as matched by re_tag).  Each tag is passed
    through indent() at the current nesting depth, and any content that
    follows it (up to the offset returned by skip_content()) is passed
    through indent() one level deeper, unless it is matched by re_empty,
    in which case it is dropped.

    Tags whose label is listed in EMPTY_ELEMENTS do not change the depth;
    they are indented one level below the current depth.

    dom: the DOM source text.
    Returns the re-indented text as a new string.
    '''
    # delete commenting
    dom = del_blockcomment(dom)

    start = 0
    depth = -1  # the first opening tag bumps this to 0
    pieces = []  # output fragments, joined once at the end

    # Search from an offset (as anonym_dom does) instead of slicing
    # dom[start:], which copied the rest of the text for every tag.
    m_tag = re_tag.search(dom, start)
    while m_tag:  # more tags
        tag = m_tag.group(0)
        label = re_tag_label.match(tag).group('label')

        # Text in front of the tag is probably comments; keep it as is.
        # The match offset is used directly rather than re-locating the
        # tag text with str.find, which could hit an earlier duplicate.
        current = m_tag.start()
        if current > start:
            pieces.append(dom[start:current])

        # indent tag
        if label not in EMPTY_ELEMENTS:  # has effect on depth
            # NOTE(review): a closing tag is emitted at the already
            # decremented depth, i.e. one level shallower than its
            # opening tag -- confirm this is the intended layout.
            if tag[1] == '/':  # closing tag
                depth -= 1
            else:  # opening tag
                depth += 1
            pieces.append(indent(tag, depth))
        else:  # still need to indent by one unit more
            pieces.append(indent(tag, depth + 1))

        # start a new line for content after a tag
        start = m_tag.end()
        current = skip_content(dom, start)
        if current > start and not re_empty.match(dom[start:current]):
            pieces.append(indent(dom[start:current], depth + 1))
        start = current

        m_tag = re_tag.search(dom, start)

    return ''.join(pieces)
def selector_index(s):
    '''
    Build an index of the selectors found in the CSS text `s`.

    Block comments are removed with del_blockcomment(), then every match
    of re_cssrule is scanned with re_css_id and re_css_class.

    Returns a dict with two sets: 'id' holds every re_css_id match and
    'class' holds every re_css_class match.
    '''
    rules = re_cssrule.findall(del_blockcomment(s))

    # One pass per index; duplicates collapse because both are sets.
    ids = {found for rule in rules for found in re_css_id.findall(rule)}
    classes = {found for rule in rules for found in re_css_class.findall(rule)}

    return {'id': ids, 'class': classes}