Example #1
    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens, or the original token if there is nothing to rewrite

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens
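
A quick sketch of what this version is expected to return for a sample Characters token, assuming the html5lib_shim helpers behave the way the comments describe: ``next_possible_entity`` splits the data at each ``&``, and ``match_entity`` returns the bare entity name (no leading ``&`` or trailing ``;``) when the chunk starts with a known entity. The input token below is hypothetical, for illustration only.

    # Hypothetical input token (illustration only)
    token = {'type': 'Characters', 'data': 'AT&amp;T &copy; 2021 & more'}

    # sanitize_characters(token) should then yield roughly:
    # [
    #     {'type': 'Characters', 'data': 'AT'},
    #     {'type': 'Characters', 'data': '&'},       # &amp; special-cased to a literal &
    #     {'type': 'Characters', 'data': 'T '},
    #     {'type': 'Entity', 'name': 'copy'},        # &copy; kept as an Entity token
    #     {'type': 'Characters', 'data': ' 2021 '},
    #     {'type': 'Characters', 'data': '& more'},  # bare & stays in a Characters token
    # ]

The serializer later escapes the plain ``&`` characters, while the Entity token passes through untouched, which is the behaviour the docstring asks for.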
Example #2
    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens, or the original token if there is nothing to rewrite

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    new_tokens.append({'type': 'Entity', 'name': entity})
                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({
                            'type': 'Characters',
                            'data': remainder
                        })
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens
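
This older variant emits an Entity token for every matched entity, including ``&amp;``, whereas the version in Example #1 special-cases ``&amp;`` into a literal ``&`` Characters token. The difference matters for LinkifyFilter, which cannot match URLs across token boundaries. A rough sketch, again assuming the html5lib_shim helpers behave as described above and using a made-up URL:

    # Hypothetical input token (illustration only)
    token = {'type': 'Characters', 'data': 'http://example.com/?a=1&amp;b=2'}

    # This variant would produce roughly:
    # [
    #     {'type': 'Characters', 'data': 'http://example.com/?a=1'},
    #     {'type': 'Entity', 'name': 'amp'},
    #     {'type': 'Characters', 'data': 'b=2'},
    # ]
    #
    # The Example #1 variant would instead produce three Characters tokens
    # ('http://example.com/?a=1', '&', 'b=2'), which can be merged back into
    # one Characters token downstream, so LinkifyFilter sees the whole URL
    # rather than three separate fragments.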