Example No. 1
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        if not hasattr(document, '__getitem__'):
            document = [document]
        for fragment in document:
            sys.stdout.write(parser.tree.testSerializer(fragment))
        sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts, opt)
        if not kwargs['quote_char']:
            del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
            sys.stdout.write(text)
        if not text.endswith('\n'):
            sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
Example No. 2
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                sys.stdout.write(
                    lxml.etree.tostring(document, encoding="unicode"))
            elif tb == "etree":
                sys.stdout.write(
                    _utils.default_etree.tostring(document,
                                                  encoding="unicode"))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            if opts.sanitize:
                kwargs["sanitize"] = True

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
Example No. 3
# Legacy html5lib sanitizer API (pre-0.99999999): sanitization is requested
# by passing a tokenizer class to the parser.
import html5lib
from html5lib import serializer, treebuilders, treewalkers
from html5lib.sanitizer import HTMLSanitizer


def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
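
The tokenizer= hook used above was removed in html5lib 0.99999999; below is a sketch of the equivalent pipeline on the current API, where sanitization is a tree-walker filter (assumes html5lib >= 1.0).

import html5lib
from html5lib import serializer, treewalkers
from html5lib.filters.sanitizer import Filter as SanitizerFilter


def sanitize_html(html):
    """Sanitizes an HTML fragment (modern html5lib API)."""
    dom_tree = html5lib.parseFragment(html, treebuilder="dom")
    walker = treewalkers.getTreeWalker("dom")
    # The sanitizer now wraps the token stream instead of the tokenizer.
    stream = SanitizerFilter(walker(dom_tree))
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values="always")
    return u''.join(s.serialize(stream))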
Example No. 4
File: parse.py Project: xrile/fjord
def printOutput(parser, document, opts):
    if opts.encoding:
        print('Encoding:', parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == 'dom':
                document.writexml(sys.stdout, encoding='utf-8')
            elif tb == 'lxml':
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document))
            elif tb == 'etree':
                sys.stdout.write(utils.default_etree.tostring(document))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite('utf-8'))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = 'utf-8'
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append('Line %i Col %i' % pos + ' ' +
                           constants.E.get(errorcode, 'Unknown error "%s"' %
                                           errorcode) % datavars)
        sys.stdout.write('\nParse errors:\n' + '\n'.join(errList) + '\n')
Example No. 5
def clean_html(data, full=False, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib
    If full is False, only the contents inside <body> will be returned (without
    the <body> tags).
    """
    if full:
        dom_tree = parser.parse(data)
    else:
        dom_tree = parser.parseFragment(data)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values="always", sanitize=True
    )
    return u"".join(s.serialize(stream))
Example No. 6
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib
    If full is False, only the contents inside <body> will be returned (without
    the <body> tags).
    """
    if full:
        dom_tree = parser.parse(data)
    else:
        dom_tree = parser.parseFragment(data)
    walker = treewalkers.getTreeWalker('dom')
    kwargs = _filter_kwargs()
    stream = TextSanitizer(walker(dom_tree), **kwargs)
    s = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values='always',
    )
    return u''.join(s.serialize(stream))
Example No. 7
# Note: uses the legacy html5lib sanitizer API (pre-0.99999999) and bs4.
import re

from bs4 import BeautifulSoup, UnicodeDammit
from html5lib import HTMLParser, getTreeWalker, sanitizer, serializer


def html_sanitize(text):
	if not text:
		return ''
	p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
	element = p.parseFragment(text)
	walker = getTreeWalker("etree")
	stream = walker(element)
	s = serializer.HTMLSerializer()
	text = s.render(stream)
	text = UnicodeDammit(text, ["utf-8"])
	REMOVE_ATTRIBUTES = [
		'lang', 'language', 'onmouseover', 'onmouseout', 'script', 'font',
		'dir', 'face', 'size', 'color', 'style', 'class', 'width', 'height',
		'hspace', 'border', 'valign', 'align', 'background', 'bgcolor',
		'text', 'link', 'vlink', 'alink', 'cellpadding', 'cellspacing', 'id']

	soup = BeautifulSoup(text.unicode_markup)
	for attribute in REMOVE_ATTRIBUTES:
		for tag in soup.findAll():
			if attribute == 'style':
				# Rebuild the style attribute from whitelisted keywords only.
				new_style = ''
				style = tag.attrs.get('style', None)
				if style:
					if style.find('normal') != -1:
						new_style += " font-weight:normal; "
					elif style.find('bold') != -1:
						new_style += " font-weight:bold; "
					if style.find('italic') != -1:
						new_style += " font-style: italic; "
					if style.find('underline') != -1:
						new_style += " text-decoration: underline; "
					tag.attrs['style'] = new_style
			else:
				del tag[attribute]

	html = soup.prettify()  # prettify('utf-8') would return bytes
	try:
		body = re.findall(r'<body>(.*)</body>', html, re.S)[0].strip()
	except IndexError:
		body = html
	return body
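
A quick illustrative call (the exact output depends on BeautifulSoup's prettify formatting):

print(html_sanitize('<p class="x" style="font-weight: bold">Hi</p>'))
# class is dropped by REMOVE_ATTRIBUTES; the style value is rebuilt from
# the whitelisted keywords as " font-weight:bold; "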
Example No. 8
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0:
            tag = 'language'
        if data.feed.has_key(tag):
            del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag):
                del entry[tag]
            if entry.has_key(tag + "_detail"):
                del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"):
                del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'):
                    continue
                for detail in entry[key].copy():
                    if detail == tag:
                        del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        # *_parsed values are time.struct_time, so compare against gmtime()
        # rather than the float returned by time.time()
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        data.entries = [
            entry for entry in data.entries
            if (not entry.has_key('published_parsed')
                or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
             or entry['updated_parsed'] <= now)
        ]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'):
                continue
            if 'html' not in node['type']:
                continue
            if not node.has_key('value'):
                continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            if node['value']:
                # Run this through HTML5's sanitizer
                doc = None
                if 'xhtml' in node['type']:
                    try:
                        from xml.dom import minidom
                        doc = minidom.parseString(node['value'])
                    except Exception:
                        node['type'] = 'text/html'

                if not doc:
                    from html5lib import html5parser, treebuilders, sanitizer
                    p = html5parser.HTMLParser(
                        tree=treebuilders.getTreeBuilder('dom'),
                        tokenizer=sanitizer.HTMLSanitizer)
                    doc = p.parseFragment(node['value'], encoding='utf-8')

                from html5lib import treewalkers, serializer
                walker = treewalkers.getTreeWalker('dom')(doc)
                xhtml = serializer.HTMLSerializer(inject_meta_charset=False)
                tree = xhtml.serialize(walker, encoding='utf-8')
                node['value'] = ''.join([str(token) for token in tree])
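
The future-dates handling above relies on feedparser's *_parsed fields being time.struct_time values, which compare chronologically field by field; a quick illustration:

import time

now = time.gmtime()
future = time.gmtime(time.time() + 86400)  # one day ahead
print(future > now)  # True: struct_time compares like a tuple, year first,
                     # so a plain > is enough to detect dates in the future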
Example No. 9
def serialize_html(input, options):
    options = dict([(str(k), v) for k, v in options.items()])
    stream = AlphabeticalAttributesFilter(JsonWalker(input))
    return serializer.HTMLSerializer(**options).render(
        stream, options.get("encoding", None))
Example No. 10
def setUp(self):
    self.parser = etree.XMLParser(resolve_entities=False)
    self.treewalker = html5lib.getTreeWalker("lxml")
    self.serializer = serializer.HTMLSerializer()
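
A sketch of how the three pieces from setUp combine outside the test class; the XML input is illustrative and assumes lxml and html5lib are installed.

import html5lib
from html5lib import serializer
from lxml import etree

parser = etree.XMLParser(resolve_entities=False)
treewalker = html5lib.getTreeWalker("lxml")
ser = serializer.HTMLSerializer()

doc = etree.fromstring("<p>one<br/>two</p>", parser=parser).getroottree()
# Optional end tags are omitted by default, so this prints roughly
# "<p>one<br>two".
print(ser.render(treewalker(doc)))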
Example No. 11
def serialize_html(input, options):
    options = dict([(str(k), v) for k, v in options.items()])
    return serializer.HTMLSerializer(**options).render(
        JsonWalker(input), options.get("encoding", None))
Example No. 12
# -*- coding:utf-8 -*-
import re

from django.utils.html import escape
from html5lib import HTMLParser, serializer, treewalkers

WWW_PATTERN = re.compile(r'(^|\s|\(|\[|\<|\:)www\.', re.UNICODE)
FTP_PATTERN = re.compile(r'(^|\s|\(|\[|\<|\:)ftp\.', re.UNICODE)
PROTOCOL_PATTERN = re.compile(
    r'(http://|ftp://|mailto:|https://)(.*?)([\.\,\?\!\)]*?)(\s|&gt;|&lt;|&quot;|$)'
)
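
# Illustrative only (not part of the original module): how the patterns
# above combine in urlify() further down -- bare "www." hosts first get a
# scheme prefixed so PROTOCOL_PATTERN can then wrap them in an anchor:
#
#   s = "see www.example.com now"
#   s = re.sub(WWW_PATTERN, r'\1http://www.', s)
#   # -> "see http://www.example.com now"
#   s = re.sub(PROTOCOL_PATTERN, r'<a href="\1\2">\1\2</a>\3\4', s)
#   # -> 'see <a href="http://www.example.com">http://www.example.com</a> now'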

_parser = HTMLParser()
_parse = _parser.parseFragment
_serializer = serializer.HTMLSerializer()
_tree_walker = treewalkers.getTreeWalker('simpletree')
def _serialize(doc):
    if not doc.childNodes:
        return u''
    return u''.join(_serializer.serialize(_tree_walker(doc)))


def usertext(value):
    doc = _parse(value)

    def urlify(s):
        s = re.sub(WWW_PATTERN, r'\1http://www.', s)
        s = re.sub(FTP_PATTERN, r'\1ftp://ftp.', s)
        s = re.sub(PROTOCOL_PATTERN, r'<a href="\1\2">\1\2</a>\3\4', s)
        return s

    def has_parents(node, tags):
        if node is None:
Example No. 13
def serialize_html(self, input, options):
    options = dict([(str(k), v) for k, v in options.iteritems()])
    return u''.join(
        serializer.HTMLSerializer(**options).serialize(
            JsonWalker(input), options.get("encoding", None)))