示例#1
0
    def __init__(self,
                 markup="",
                 features=None,
                 builder=None,
                 parse_only=None,
                 from_encoding=None):
        """Set this object up as the 'root tag' and parse `markup` into it.

        `markup` may be a string or a file-like object; anything with a
        `read` attribute is read in full before parsing.

        If `builder` is not given, a tree builder class is looked up in
        `builder_registry` using `features` (a single feature string or
        a sequence of them; `DEFAULT_BUILDER_FEATURES` is used when none
        are given) and instantiated with no arguments. A ValueError is
        raised when the registry has no matching builder.

        `parse_only` is stored on the object unchanged, and
        `from_encoding` is handed to the builder's `prepare_markup`
        together with the markup.
        """

        if builder is None:
            # Normalise a lone feature name into a one-element list so it
            # can be splatted into the registry lookup below.
            if isinstance(features, basestring):
                wanted = [features]
            else:
                wanted = features
            if wanted is None or not len(wanted):
                wanted = self.DEFAULT_BUILDER_FEATURES

            chosen_class = builder_registry.lookup(*wanted)
            if chosen_class is None:
                raise ValueError(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?" %
                    ",".join(wanted))
            builder = chosen_class()

        # Link the soup and its builder to each other for the duration
        # of the parse.
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        # File-like input: pull the whole contents in before preparing it.
        if hasattr(markup, 'read'):
            markup = markup.read()

        prepared = self.builder.prepare_markup(markup, from_encoding)
        self.markup, self.original_encoding, self.declared_html_encoding = (
            prepared)

        try:
            self._feed()
        except StopParsing:
            # Treated as a normal way for parsing to end.
            pass

        # Drop the markup and both halves of the soup <-> builder link so
        # the objects can be garbage collected.
        self.markup = None
        self.builder.soup = None
        self.builder = None
示例#2
0
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None):
        """Create the soup as the 'root tag' and run the parser over `markup`.

        `markup` is either a string or a file-like object (detected by
        the presence of a `read` attribute, in which case it is read in
        full). When `builder` is omitted, `features` -- one feature name
        or a sequence of names, defaulting to
        `DEFAULT_BUILDER_FEATURES` -- selects a tree builder class from
        `builder_registry`; a ValueError is raised if none matches.
        `from_encoding` is forwarded to the builder's `prepare_markup`.
        """

        if builder is None:
            feature_list = features
            if isinstance(feature_list, basestring):
                # A single feature name was given; wrap it for the lookup.
                feature_list = [feature_list]
            if feature_list is None or len(feature_list) == 0:
                feature_list = self.DEFAULT_BUILDER_FEATURES

            found_class = builder_registry.lookup(*feature_list)
            if found_class is not None:
                builder = found_class()
            else:
                template = (
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?")
                raise ValueError(template % ",".join(feature_list))

        # Tie the builder and the soup together while parsing.
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):
            # A file-type object: replace it with its full contents.
            markup = markup.read()

        prepared_markup = self.builder.prepare_markup(markup, from_encoding)
        (self.markup,
         self.original_encoding,
         self.declared_html_encoding) = prepared_markup

        try:
            self._feed()
        except StopParsing:
            # Parsing was cut short on purpose; nothing to report.
            pass

        # Release the markup and the builder so they can be garbage
        # collected.
        self.markup = None
        self.builder.soup = None
        self.builder = None
示例#3
0
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        If `builder` is not given, `features` (one feature name or a
        sequence of names, defaulting to `DEFAULT_BUILDER_FEATURES`) is
        used to look up a tree builder class in `builder_registry`; a
        ValueError is raised when no builder matches.

        `parse_only` is stored on the object and `from_encoding` is
        passed to the builder's `prepare_markup`.

        `kwargs` exists only to tolerate legacy keyword arguments:

        * `convertEntities`, `markupMassage`, `smartQuotesTo`,
          `selfClosingTags` and `isHTML` are accepted, ignored, and
          each triggers a warning.
        * `parseOnlyThese` and `fromEncoding` are accepted, with a
          warning, as old names for `parse_only` and `from_encoding`;
          they are only consulted when the new-style argument is falsy.

        Any other keyword argument raises TypeError.
        """

        if 'convertEntities' in kwargs:
            # Remove the argument like the other ignored legacy arguments
            # below; otherwise the leftover-kwargs check would reject it
            # with a TypeError right after warning about it.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            """Pop a renamed legacy argument from kwargs, warning if present.

            Returns the value passed under `old_name`, or None when the
            caller did not supply it.
            """
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        # The new-style argument wins; the legacy name is only looked at
        # (and removed from kwargs) when the new one is falsy.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything still left in kwargs is an argument we don't know.
        if len(kwargs) > 0:
            # list() keeps this working where keys() returns a view
            # rather than a list.
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # Accept a single feature name as shorthand for a one-item list.
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise ValueError(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            # Treated as a normal way for parsing to end.
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None