def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: raise ValueError( "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only self.reset() if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() self.markup, self.original_encoding, self.declared_html_encoding = ( self.builder.prepare_markup(markup, from_encoding)) try: self._feed() except StopParsing: pass # Clear out the markup and the builder so they can be CGed. self.markup = None self.builder.soup = None self.builder = None
def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if 'convertEntities' in kwargs: warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " "to Unicode characters.") if 'markupMassage' in kwargs: del kwargs['markupMassage'] warnings.warn( "BS4 does not respect the markupMassage argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for any necessary markup massage.") if 'smartQuotesTo' in kwargs: del kwargs['smartQuotesTo'] warnings.warn( "BS4 does not respect the smartQuotesTo argument to the " "BeautifulSoup constructor. Smart quotes are always converted " "to Unicode characters.") if 'selfClosingTags' in kwargs: del kwargs['selfClosingTags'] warnings.warn( "BS4 does not respect the selfClosingTags argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for understanding self-closing tags.") if 'isHTML' in kwargs: del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " "BeautifulSoup constructor. You can pass in features='html' " "or features='xml' to get a builder capable of handling " "one or the other.") def deprecated_argument(old_name, new_name): if old_name in kwargs: warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' 'has been renamed to "%s."' % (old_name, new_name)) value = kwargs[old_name] del kwargs[old_name] return value return None parse_only = parse_only or deprecated_argument( "parseOnlyThese", "parse_only") from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: raise ValueError( "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only self.reset() if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) = ( self.builder.prepare_markup(markup, from_encoding)) try: self._feed() except StopParsing: pass # Clear out the markup and remove the builder's circular # reference to this object. self.markup = None self.builder.soup = None