def descriptify(doc, base=None, proxy=None):
    """Clean JavaScript in a html source string.

    The document is parsed with ``parse_html`` and re-assembled piece by
    piece:

    * tags listed in ``BLOCKED_TAGNAMES`` are reduced to a bare opening /
      closing tag and any non-blank text between them is replaced by a
      placeholder comment;
    * ``base`` tags lose all their attributes;
    * on every other tag, ``on*`` and ``http-equiv`` attributes are emptied,
      ``frame``/``iframe`` sources point to a static page, and attributes
      in ``URI_ATTRIBUTES`` are either neutralised (when they contain
      JavaScript), wrapped with ``wrap_url`` or made absolute with
      ``urljoin``.

    :param doc: the html source string.
    :param base: base url used when rewriting urls and css; nothing is
        rewritten against a base when it is falsy.
    :param proxy: when truthy (together with ``base``) ``style`` attributes
        go through ``process_css`` and urls go through ``wrap_url``.
    :returns: the cleaned html as a single string.
    """
    pieces = []
    # True from a blocked opening tag until the next blocked closing tag.
    inside_blocked = False
    for node in parse_html(doc):
        if not isinstance(node, HtmlTag):
            raw = doc[node.start:node.end]
            if inside_blocked and raw.strip():
                pieces.append('<!-- Removed by portia -->')
            else:
                pieces.append(raw)
            continue

        tag_name = node.tag
        if tag_name in BLOCKED_TAGNAMES:
            # Assumes there are no void elements in BLOCKED_TAGNAMES
            # http://www.w3.org/TR/html5/syntax.html#void-elements
            if not inside_blocked and node.tag_type in (
                    HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG):
                pieces.append('<%s>' % tag_name)
                inside_blocked = True
            elif node.tag_type == HtmlTagType.CLOSE_TAG:
                pieces.append('</%s>' % tag_name)
                inside_blocked = False
            continue

        if tag_name == 'base':
            node.attributes = {}
            pieces.append(serialize_tag(node))
            continue

        attrs = node.attributes
        # Iterate over a snapshot: the loop may add ``_portia_*`` keys.
        for name, value in list(attrs.items()):
            if name.startswith('on') or name == "http-equiv":
                # Empty intrinsic events
                attrs[name] = ""
            elif base and proxy and name == "style" and value is not None:
                attrs[name] = process_css(value, -1, base)
            elif name == 'src' and tag_name in ('frame', 'iframe'):
                attrs[name] = '/static/frames-not-supported.html'
            elif name in URI_ATTRIBUTES and value is not None:
                # Rewrite javascript URIs
                if _contains_js(unscape(value)):
                    attrs[name] = "#"
                elif base:
                    anchor_href = tag_name == "a" and name == 'href'
                    if proxy and not anchor_href:
                        attrs[name] = wrap_url(value, -1, base)
                        # Keep the original value next to the wrapped one.
                        attrs['_portia_%s' % name] = value
                    else:
                        attrs[name] = urljoin(base, value)
        pieces.append(serialize_tag(node))
    return ''.join(pieces)
def descriptify(doc, base=None, proxy=None):
    """Clean JavaScript in a html source string.

    Blocked elements (``BLOCKED_TAGNAMES``) keep only their bare tags and
    have their text replaced by a placeholder comment, ``base`` tags are
    stripped of attributes, and the attributes of all remaining tags are
    scrubbed: event handlers and ``http-equiv`` are emptied, frame sources
    are redirected to a static page and url attributes are neutralised,
    wrapped or made absolute.

    :param doc: the html source string.
    :param base: base url for url / css rewriting (skipped when falsy).
    :param proxy: with ``base``, enables ``process_css`` / ``wrap_url``.
    :returns: the cleaned html string.
    """
    proxied = bool(base and proxy)

    def _scrub(tag):
        # Neutralise the attributes of ``tag`` in place.
        is_frame = tag.tag in ('frame', 'iframe')
        is_anchor = tag.tag == "a"
        for attr, original in tag.attributes.copy().items():
            if attr == "http-equiv" or attr.startswith('on'):
                # Empty intrinsic events
                tag.attributes[attr] = ""
            elif proxied and attr == "style" and original is not None:
                tag.attributes[attr] = process_css(original, -1, base)
            elif is_frame and attr == 'src':
                tag.attributes[attr] = '/static/frames-not-supported.html'
            elif attr in URI_ATTRIBUTES and original is not None:
                # Rewrite javascript URIs
                if _contains_js(unscape(original)):
                    tag.attributes[attr] = "#"
                elif proxied and not (is_anchor and attr == 'href'):
                    tag.attributes[attr] = wrap_url(original, -1, base)
                    tag.attributes['_portia_%s' % attr] = original
                elif base:
                    tag.attributes[attr] = urljoin(base, original)

    parsed = parse_html(doc)
    cleaned = []
    # Set by a blocked opening tag, cleared by a blocked closing tag.
    in_blocked = False
    for item in parsed:
        if isinstance(item, HtmlTag):
            if item.tag in BLOCKED_TAGNAMES:
                # Assumes there are no void elements in BLOCKED_TAGNAMES
                # http://www.w3.org/TR/html5/syntax.html#void-elements
                opens = item.tag_type in (HtmlTagType.OPEN_TAG,
                                          HtmlTagType.UNPAIRED_TAG)
                if opens and not in_blocked:
                    cleaned.append('<%s>' % item.tag)
                    in_blocked = True
                elif item.tag_type == HtmlTagType.CLOSE_TAG:
                    cleaned.append('</%s>' % item.tag)
                    in_blocked = False
            else:
                if item.tag == 'base':
                    item.attributes = {}
                else:
                    _scrub(item)
                cleaned.append(serialize_tag(item))
        else:
            fragment = doc[item.start:item.end]
            suppressed = in_blocked and fragment.strip()
            cleaned.append(
                '<!-- Removed by portia -->' if suppressed else fragment)
    return ''.join(cleaned)
def apply_annotations(annotations, target_page, legacy=False):
    """Write annotation data into the html of ``target_page``.

    The page is numbered with ``add_tagids``, selector based annotations
    are converted to tag id based ones, and the parsed page is then walked
    once: everything before an annotated tag is copied to the output
    unchanged, annotated tags receive the attributes produced by
    ``_gen_annotation_info`` and generated annotations are handed to
    ``_get_generated_annotation``.

    :param annotations: the annotations to apply; they are split by
        ``_filter_annotations`` into selector and tag id annotations.
    :param target_page: html source of the page to annotate.
    :param legacy: forwarded to the annotation generation helpers.
    :returns: the annotated html with the temporary tag ids removed.
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    element = next(target)
    last_id = 0
    # Group the annotations by the tag id they refer to.
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    dummy = [(1e9, [{}])]
    # Annotations without a tag id cannot be placed in the document and
    # ``int(None)`` raises TypeError, so they are skipped.  The dummy is
    # appended after sorting so that it is always processed last and never
    # compared against real entries.
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                 if k is not None])
    sorted_annotations += dummy
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Text fragments and <ins> tags are copied verbatim.
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # '__added' marks tags that were already written out so
                    # they are never emitted twice.
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    # Emit content that was queued for the tag just closed.
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # Not a tag: it has no attributes.
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = _gen_annotation_info(regular_annotations,
                                                       legacy)
                for key, val in annotation_info.items():
                    element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming the
                # main iterator.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts,
                    legacy)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts,
                    legacy)
            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # Every annotation was placed: copy the rest of the page.
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def apply(self):
    """Write the annotation data into ``self.numbered_html``.

    The annotations returned by ``self.split()`` are grouped by their
    ``tagid``; the parsed html is then walked once in document order.
    Everything located before an annotated tag is copied to the output
    unchanged, annotated tags receive the attributes returned by
    ``self.generate`` and generated annotations are delegated to
    ``self._get_generated``.

    :returns: the result of ``remove_tagids`` applied to the joined output.
    """
    selector_annotations, tagid_annotations = self.split()
    inserts, numbered_html = defaultdict(list), self.numbered_html
    if selector_annotations:
        converted_annotations = self.apply_selector(selector_annotations)
        tagid_annotations += converted_annotations
    if not self.legacy:
        tagid_annotations = self.verify(
            [arg_to_iter(a) for a in tagid_annotations])
    target = iter(parse_html(numbered_html))
    output, stack = [], []
    elem = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    # Group the annotations by the tag id they refer to.
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    # Annotations whose tag id is None are left out of the sorted list.
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                 if k is not None])
    try:
        for aid, annotation_data in chain(sorted_annotations, dummy):
            # Move target until replacement/insertion point
            while True:
                # Non-tag fragments and <ins> tags are copied verbatim.
                while not isinstance(elem, HtmlTag) or elem.tag == 'ins':
                    output.append(numbered_html[elem.start:elem.end])
                    elem = next(target)
                if elem.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = elem.attributes.get(TAGID)
                    stack.append(last_id)
                if elem.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and stack:
                    # '__added' marks tags already written to the output so
                    # that they are not emitted a second time.
                    if ('__added' not in elem.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[elem.start:elem.end])
                        elem.attributes['__added'] = True
                    last_inserted = stack.pop()
                    # NOTE(review): ``inserts`` is only read here; it is
                    # presumably filled by self._get_generated, which
                    # receives it as an argument -- confirm.
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            elem = next(target)
                            try:
                                last_id = elem.attributes.get(TAGID,
                                                              last_id)
                            except AttributeError:
                                # Not a tag: no ``attributes`` to read.
                                pass
                            if isinstance(elem, HtmlTag):
                                break
                        continue
                # Tags located before the annotated one are copied as is;
                # the loop stops on the first tag that is not before it.
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in elem.attributes:
                        output.append(numbered_html[elem.start:elem.end])
                        elem.attributes['__added'] = True
                    elem = next(target)
                else:
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = self.generate(regular_annotations)
                for key, val in annotation_info.items():
                    elem.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() duplicates the iterator so the helper can look ahead
                # while ``target`` keeps its current position.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = self._get_generated(
                    elem, generated, nodes, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if elem.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = self._get_generated(
                    elem, next_generated, nodes, inserts)
            if '__added' not in elem.attributes:
                # Serialize rather than slice the source: the attributes of
                # this tag may have been modified above.
                output.append(serialize_tag(elem))
                elem.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[elem.start:elem.end])
    else:
        # Every annotation was processed: copy whatever is left of the page.
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))