示例#1
0
文件: html.py 项目: monocleman1/dd
def descriptify(doc, base=None, proxy=None):
    """Strip executable content from an HTML source string.

    Tags listed in ``BLOCKED_TAGNAMES`` are reduced to a bare open/close pair
    and any non-blank text between them is replaced by a placeholder comment.
    ``<base>`` tags lose all of their attributes.  On every other tag,
    ``on*`` and ``http-equiv`` attributes are blanked, frame sources are
    pointed at a static placeholder page and URI attributes are either
    neutralised (when they contain JavaScript) or rewritten.

    :param doc: HTML source to clean.
    :param base: base URL; when truthy, URI attributes are resolved against
        it with ``urljoin`` unless the proxy rewriting below applies.
    :param proxy: when truthy together with ``base``, ``style`` values are
        passed through ``process_css`` and URI attributes (except
        ``<a href>``) through ``wrap_url``; the original value is kept in a
        ``_portia_<attribute>`` attribute.
    :return: the cleaned document as a single string.
    """
    pieces = []
    inside_blocked = False
    rewrite_for_proxy = base and proxy

    for node in parse_html(doc):
        # Anything that is not a tag is copied from the source, unless it is
        # non-blank content sitting inside a blocked element.
        if not isinstance(node, HtmlTag):
            fragment = doc[node.start:node.end]
            if inside_blocked and fragment.strip():
                pieces.append('<!-- Removed by portia -->')
            else:
                pieces.append(fragment)
            continue

        tag_name = node.tag

        if tag_name in BLOCKED_TAGNAMES:
            # Assumes there are no void elements in BLOCKED_TAGNAMES
            # http://www.w3.org/TR/html5/syntax.html#void-elements
            if not inside_blocked and node.tag_type in (
                    HtmlTagType.OPEN_TAG, HtmlTagType.UNPAIRED_TAG):
                pieces.append('<%s>' % tag_name)
                inside_blocked = True
            elif node.tag_type == HtmlTagType.CLOSE_TAG:
                pieces.append('</%s>' % tag_name)
                inside_blocked = False
            continue

        if tag_name == 'base':
            node.attributes = {}
            pieces.append(serialize_tag(node))
            continue

        attrs = node.attributes
        # Iterate over a snapshot: ``_portia_*`` keys may be added below.
        for name, value in list(attrs.items()):
            # Blank out intrinsic event handlers and http-equiv
            if name.startswith('on') or name == "http-equiv":
                attrs[name] = ""
                continue
            if rewrite_for_proxy and name == "style" and value is not None:
                attrs[name] = process_css(value, -1, base)
                continue
            if tag_name in ('frame', 'iframe') and name == 'src':
                attrs[name] = '/static/frames-not-supported.html'
                continue
            if name not in URI_ATTRIBUTES or value is None:
                continue
            # Rewrite javascript URIs
            if _contains_js(unscape(value)):
                attrs[name] = "#"
            elif rewrite_for_proxy and (tag_name != "a" or name != 'href'):
                attrs[name] = wrap_url(value, -1, base)
                attrs['_portia_%s' % name] = value
            elif base:
                attrs[name] = urljoin(base, value)
        pieces.append(serialize_tag(node))

    return ''.join(pieces)
示例#2
0
def descriptify(doc, base=None, proxy=None):
    """Return ``doc`` with JavaScript-carrying markup removed or defused.

    Elements named in ``BLOCKED_TAGNAMES`` keep only a bare start/end tag and
    their non-blank inner text is swapped for a placeholder comment.
    ``<base>`` tags are emitted without attributes.  All remaining tags have
    their attributes sanitised one by one before being re-serialised.

    :param doc: HTML source string.
    :param base: base URL used to resolve or wrap URI attributes (optional).
    :param proxy: when truthy together with ``base``, CSS and URI values are
        rewritten through ``process_css`` / ``wrap_url`` instead of merely
        being joined with ``base``.
    :return: the sanitised HTML string.
    """
    proxied = base and proxy

    def rewrite_attribute(tag, key, val):
        """Sanitise one attribute of ``tag`` in place."""
        attrs = tag.attributes
        if key.startswith('on') or key == "http-equiv":
            # Empty intrinsic events
            attrs[key] = ""
        elif proxied and key == "style" and val is not None:
            attrs[key] = process_css(val, -1, base)
        elif tag.tag in ('frame', 'iframe') and key == 'src':
            attrs[key] = '/static/frames-not-supported.html'
        elif key in URI_ATTRIBUTES and val is not None:
            # Rewrite javascript URIs
            if _contains_js(unscape(val)):
                attrs[key] = "#"
            elif proxied and (tag.tag != "a" or key != 'href'):
                attrs[key] = wrap_url(val, -1, base)
                # Keep the untouched value next to the wrapped one.
                attrs['_portia_%s' % key] = val
            elif base:
                attrs[key] = urljoin(base, val)

    chunks = []
    in_blocked = False
    for item in parse_html(doc):
        if isinstance(item, HtmlTag):
            if item.tag in BLOCKED_TAGNAMES:
                # Assumes there are no void elements in BLOCKED_TAGNAMES
                # http://www.w3.org/TR/html5/syntax.html#void-elements
                starts_element = item.tag_type in (HtmlTagType.OPEN_TAG,
                                                   HtmlTagType.UNPAIRED_TAG)
                if not in_blocked and starts_element:
                    chunks.append('<%s>' % item.tag)
                    in_blocked = True
                elif item.tag_type == HtmlTagType.CLOSE_TAG:
                    chunks.append('</%s>' % item.tag)
                    in_blocked = False
            elif item.tag == 'base':
                item.attributes = {}
                chunks.append(serialize_tag(item))
            else:
                # Snapshot the items: the helper may add ``_portia_*`` keys.
                for key, val in list(item.attributes.items()):
                    rewrite_attribute(item, key, val)
                chunks.append(serialize_tag(item))
        else:
            raw = doc[item.start:item.end]
            hide = in_blocked and raw.strip()
            chunks.append('<!-- Removed by portia -->' if hide else raw)

    return ''.join(chunks)
示例#3
0
def apply_annotations(annotations, target_page, legacy=False):
    """Return ``target_page`` with the given annotations merged into it.

    The page is first given tag ids (``add_tagids``), then streamed element
    by element.  Annotations are grouped by their ``'tagid'`` value and
    visited in ascending numeric order; for each group the stream is copied
    to the output until the tag with that id is reached, at which point the
    annotation data is applied to the tag.  Tag ids are removed again from
    the final string (``remove_tagids``).

    :param annotations: annotations to apply; ``_filter_annotations`` splits
        them into selector-based and tagid-based ones, and selector-based
        ones are converted with ``apply_selector_annotations``.
    :param target_page: page source handed to ``add_tagids``.
    :param legacy: forwarded unchanged to ``_gen_annotation_info`` and
        ``_get_generated_annotation``.
    :return: the annotated page source as a string.
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    # Content waiting to be emitted when the tag with a given id is closed;
    # handed to _get_generated_annotation (presumably filled there) and
    # drained in the loop below.
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    element = next(target)
    last_id = 0
    # Group annotations by the id of the tag they refer to.
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    dummy = [(1e9, [{}])]
    # Annotations whose tagid is None cannot be located in the page and
    # int(None) would raise TypeError, so they are skipped.
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                 if k is not None] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Non-tag fragments and <ins> tags are copied verbatim.
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    # Emit the tag itself (once) before any pending inserts
                    # if it lies before the annotated tag.
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                # Element has no ``attributes``; keep last_id.
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    # Still before the annotated tag: copy and advance.
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = _gen_annotation_info(regular_annotations,
                                                       legacy)
                for key, val in annotation_info.items():
                    element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the helper look ahead without consuming target.
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts,
                    legacy)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts,
                    legacy)

            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                # Copy everything up to the next text fragment; that fragment
                # is not copied, next_text_section is emitted in its place.
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # All annotations handled: copy whatever is left of the page.
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
示例#4
0
    def apply(self):
        """Merge the annotations into ``self.numbered_html`` and return it.

        Annotations come from ``self.split()`` as selector-based and
        tagid-based groups; selector-based ones are converted with
        ``self.apply_selector``.  Unless ``self.legacy`` is set, the tagid
        annotations are passed through ``self.verify``.  The parsed page is
        then streamed once: for each tag id (ascending) the stream is copied
        to the output up to the matching tag, where the annotation data is
        applied.

        :return: the annotated page source with tag ids removed
            (``remove_tagids``).
        """
        selector_annotations, tagid_annotations = self.split()
        # ``inserts`` is handed to self._get_generated (presumably filled
        # there) and drained below when the tag with a given id is closed.
        inserts, numbered_html = defaultdict(list), self.numbered_html
        if selector_annotations:
            converted_annotations = self.apply_selector(selector_annotations)
            tagid_annotations += converted_annotations
        if not self.legacy:
            tagid_annotations = self.verify(
                [arg_to_iter(a) for a in tagid_annotations])
        target = iter(parse_html(numbered_html))
        output, stack = [], []
        elem = next(target)
        last_id = 0
        # XXX: A dummy element is added to the end so if the last annotation is
        #      generated it will be added to the output
        # Group annotations by the id of the tag they refer to.
        filtered = defaultdict(list)
        for grouped in tagid_annotations:
            for ann in arg_to_iter(grouped):
                filtered[ann['tagid']].append(ann)
        dummy = [(1e9, [{}])]
        # Annotations without a tagid (None) are dropped before sorting.
        sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                     if k is not None])
        try:
            # The dummy entry is chained after the real ones, not sorted in.
            for aid, annotation_data in chain(sorted_annotations, dummy):
                # Move target until replacement/insertion point
                while True:
                    # Non-tag fragments and <ins> tags are copied verbatim.
                    while not isinstance(elem, HtmlTag) or elem.tag == 'ins':
                        output.append(numbered_html[elem.start:elem.end])
                        elem = next(target)
                    if elem.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                        last_id = elem.attributes.get(TAGID)
                        stack.append(last_id)
                    if elem.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and stack:
                        # Emit the tag itself (once) before any pending
                        # inserts if it lies before the annotated tag.
                        if ('__added' not in elem.attributes and
                                last_id is not None and aid is not None and
                                int(last_id) < int(aid)):
                            output.append(numbered_html[elem.start:elem.end])
                            elem.attributes['__added'] = True
                        last_inserted = stack.pop()
                        to_insert = inserts.pop(last_inserted, None)
                        if to_insert:
                            output.extend(to_insert)
                            # Skip all nodes up to the next HtmlTag as these
                            # have already been added
                            while True:
                                elem = next(target)
                                try:
                                    last_id = elem.attributes.get(TAGID,
                                                                  last_id)
                                except AttributeError:
                                    # No ``attributes`` on this element;
                                    # keep the previous last_id.
                                    pass
                                if isinstance(elem, HtmlTag):
                                    break
                            continue
                    if (last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        # Still before the annotated tag: copy and advance.
                        if '__added' not in elem.attributes:
                            output.append(numbered_html[elem.start:elem.end])
                            elem.attributes['__added'] = True
                        elem = next(target)
                    else:
                        break

                generated = []
                next_generated = []
                regular_annotations = []
                # Place generated annotations at the end and sort by slice
                for annotation in sorted(annotation_data, key=_annotation_key):
                    if annotation.get('generated'):
                        if annotation.get('insert_after'):
                            next_generated.append(annotation)
                        else:
                            generated.append(annotation)
                    else:
                        regular_annotations.append(annotation)
                # Add annotations data as required
                if regular_annotations:
                    annotation_info = self.generate(regular_annotations)
                    for key, val in annotation_info.items():
                        elem.attributes[key] = val
                next_text_section = ''
                if generated:
                    # tee() lets the helper look ahead without consuming
                    # ``target``.
                    inner_data, target = tee(target)
                    nodes = _get_inner_nodes(inner_data)
                    next_text_section = self._get_generated(
                        elem, generated, nodes, inserts)
                if next_generated:
                    inner_data, target = tee(target)
                    open_tags = 0 if elem.tag_type == UNPAIRED_TAG else 1
                    nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                             insert_after=True)
                    next_text_section = self._get_generated(
                        elem, next_generated, nodes, inserts)

                # Serialize the (possibly modified) tag unless it was already
                # written out.
                if '__added' not in elem.attributes:
                    output.append(serialize_tag(elem))
                    elem.attributes['__added'] = True
                # If an <ins> tag has been inserted we need to move forward
                if next_text_section:
                    # Copy everything up to the next text fragment; that
                    # fragment is not copied, next_text_section is emitted
                    # in its place.
                    while True:
                        elem = next(target)
                        if (isinstance(elem, HtmlDataFragment) and
                                elem.is_text_content):
                            break
                        output.append(numbered_html[elem.start:elem.end])
                    output.append(next_text_section)
        # Reached the end of the document
        except StopIteration:
            output.append(numbered_html[elem.start:elem.end])
        else:
            # All annotations handled: copy whatever is left of the page.
            for element in target:
                output.append(numbered_html[element.start:element.end])
        return remove_tagids(''.join(output))