示例#1
0
    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder.

        :param tag: The tag to inspect. It must expose ``name``, ``get()``
            and item assignment.
        :return: Whether or not a substitution was performed.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            # This branch swaps in a placeholder too, so it must be
            # reported just like the HTML 5 branch.
            substituted = True

        return substituted
    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed. This is
            True for both the HTML 5 ("charset") and the HTML 4
            ("http-equiv"/"content") forms of the declaration.
        """
        # We are only interested in <meta> tags
        if tag.name != "meta":
            return False

        http_equiv = tag.get("http-equiv")
        content = tag.get("content")
        charset = tag.get("charset")

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        substituted = False
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag["charset"] = CharsetMetaAttributeValue(charset)
            substituted = True

        elif (
            content is not None
            and http_equiv is not None
            and http_equiv.lower() == "content-type"
        ):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag["content"] = ContentMetaAttributeValue(content)
            # The placeholder was installed here as well, so report it;
            # otherwise the return value contradicts the documented
            # contract for HTML 4-style declarations.
            substituted = True

        return substituted
示例#3
0
 def test_content_meta_attribute_value(self):
     # Wrap an HTML 4-style content-type declaration in the placeholder.
     attribute = ContentMetaAttributeValue("text/html; charset=euc-jp")

     # The placeholder compares equal to the text it was built from ...
     self.assertEqual("text/html; charset=euc-jp", attribute)

     # ... and keeps that text available as original_value.
     remembered = attribute.original_value
     self.assertEqual("text/html; charset=euc-jp", remembered)

     # Encoding it swaps the declared charset for the requested one.
     as_utf8 = attribute.encode("utf8")
     self.assertEqual("text/html; charset=utf8", as_utf8)
示例#4
0
 def test_content_meta_attribute_value(self):
     # Build the placeholder from a content-type declaration naming euc-jp.
     meta_content = ContentMetaAttributeValue("text/html; charset=euc-jp")
     check = self.assertEqual

     # Equality with the source text, directly and through original_value.
     check("text/html; charset=euc-jp", meta_content)
     check("text/html; charset=euc-jp", meta_content.original_value)

     # encode() is expected to report the target encoding in the charset.
     check("text/html; charset=utf8", meta_content.encode("utf8"))
示例#5
0
 def test_content_meta_attribute_value(self):
     # Placeholder for an HTML 4-style declaration that names euc-jp.
     placeholder = ContentMetaAttributeValue("text/html; charset=euc-jp")

     # It must look like the text it wraps and remember that text.
     assert "text/html; charset=euc-jp" == placeholder
     stored = placeholder.original_value
     assert "text/html; charset=euc-jp" == stored

     # Each target encoding is expected to appear in the charset on encode().
     utf8_form = placeholder.encode("utf8")
     assert "text/html; charset=utf8" == utf8_form
     ascii_form = placeholder.encode("ascii")
     assert "text/html; charset=ascii" == ascii_form