Пример #1
0
def plaintext(text, keeplinebreaks=True):
    if isinstance(text, Fragment):
        text = text.generate().render('text', encoding=None)
    else:
        text = stripentities(striptags(text))
    if not keeplinebreaks:
        text = text.replace(u'\n', u' ')
    return text
Пример #2
0
    def handle_starttag(self, tag, attrib):
        fixed_attrib = []
        for name, value in attrib:  # Fixup minimized attributes
            if value is None:
                value = name
            fixed_attrib.append((QName(name), stripentities(value)))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)
Пример #3
0
def plaintext(text, keeplinebreaks=True):
    """Extract the text elements from (X)HTML content

    :param text: `unicode` or `genshi.builder.Fragment`
    :param keeplinebreaks: optionally keep linebreaks
    """
    if isinstance(text, Fragment):
        text = text.generate().render('text', encoding=None)
    else:
        text = stripentities(striptags(text))
    if not keeplinebreaks:
        text = text.replace(u'\n', u' ')
    return text
Пример #4
0
def plaintext(text, keeplinebreaks=True):
    """Extract the text elements from (X)HTML content

    :param text: `unicode` or `genshi.builder.Fragment`
    :param keeplinebreaks: optionally keep linebreaks
    """
    if isinstance(text, Fragment):
        text = text.generate().render('text', encoding=None)
    else:
        text = stripentities(striptags(text))
    if not keeplinebreaks:
        text = text.replace(u'\n', u' ')
    return text
Пример #5
0
    def handle_starttag(self, tag, attrib):
        fixed_attrib = []
        for name, value in attrib: # Fixup minimized attributes
            if value is None:
                value = str(name)
            elif not isinstance(value, str):
                value = value.decode(self.encoding, 'replace')
            fixed_attrib.append((QName(name), stripentities(value)))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)
Пример #6
0
    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        :param stream: the markup event stream to filter
        """
        waiting_for = None

        for kind, data, pos in stream:
            if kind is START:
                if waiting_for:
                    continue
                tag, attrs = data
                if not self.is_safe_elem(tag, attrs):
                    waiting_for = tag
                    continue

                new_attrs = []
                for attr, value in attrs:
                    value = stripentities(value)
                    if attr not in self.safe_attrs:
                        continue
                    elif attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if not self.is_safe_uri(value):
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self.sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                tag = data
                if waiting_for:
                    if waiting_for == tag:
                        waiting_for = None
                else:
                    yield kind, data, pos

            elif kind is not COMMENT:
                if not waiting_for:
                    yield kind, data, pos
Пример #7
0
    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        :param stream: the markup event stream to filter
        """
        waiting_for = None

        for kind, data, pos in stream:
            if kind is START:
                if waiting_for:
                    continue
                tag, attrs = data
                if not self.is_safe_elem(tag, attrs):
                    waiting_for = tag
                    continue

                new_attrs = []
                for attr, value in attrs:
                    value = stripentities(value)
                    if attr not in self.safe_attrs:
                        continue
                    elif attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if not self.is_safe_uri(value):
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self.sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                tag = data
                if waiting_for:
                    if waiting_for == tag:
                        waiting_for = None
                else:
                    yield kind, data, pos

            elif kind is not COMMENT:
                if not waiting_for:
                    yield kind, data, pos
Пример #8
0
 def _to_python(self, value, state=None):
     value = super(MarkupConverter, self)._to_python(value, state)
     if value:
         value = Markup(stripentities(value)).unescape()
         return self.cleaner('', value)
Пример #9
0
 def _to_python(self, value, state=None):
     value = super(MarkupConverter, self)._to_python(value, state)
     if value:
         value = Markup(stripentities(value)).unescape()
         return self.cleaner('', value)
Пример #10
0
    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        :param stream: the markup event stream to filter
        """
        waiting_for = None

        def _get_scheme(href):
            if ':' not in href:
                return None
            chars = [char for char in href.split(':', 1)[0] if char.isalnum()]
            return ''.join(chars).lower()

        for kind, data, pos in stream:
            if kind is START:
                if waiting_for:
                    continue
                tag, attrs = data
                if tag not in self.safe_tags:
                    waiting_for = tag
                    continue

                new_attrs = []
                for attr, value in attrs:
                    value = stripentities(value)
                    if attr not in self.safe_attrs:
                        continue
                    elif attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if _get_scheme(value) not in self.safe_schemes:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = []
                        value = self._replace_unicode_escapes(value)
                        for decl in filter(None, value.split(';')):
                            is_evil = False
                            if 'expression' in decl:
                                is_evil = True
                            for m in re.finditer(r'url\s*\(([^)]+)', decl):
                                if _get_scheme(m.group(1)) not in self.safe_schemes:
                                    is_evil = True
                                    break
                            if not is_evil:
                                decls.append(decl.strip())
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                tag = data
                if waiting_for:
                    if waiting_for == tag:
                        waiting_for = None
                else:
                    yield kind, data, pos

            else:
                if not waiting_for:
                    yield kind, data, pos