def assertSoupEquals(self, to_parse, compare_parsed_to=None):
    """Assert that parsing `to_parse` yields the expected document.

    :param to_parse: Markup handed to BeautifulSoup, using
        `self.default_builder` as the builder.
    :param compare_parsed_to: Markup the decoded result is compared
        against, after being passed through `self.document_for`.
        When None, `to_parse` itself is used.
    """
    tree_builder = self.default_builder
    parsed = BeautifulSoup(to_parse, builder=tree_builder)
    expected_markup = (
        to_parse if compare_parsed_to is None else compare_parsed_to)
    self.assertEqual(parsed.decode(), self.document_for(expected_markup))
def __init__(self, namespaceHTMLElements, soup=None):
    """Remember the soup to build into, then initialise the base class.

    :param namespaceHTMLElements: Forwarded unchanged to the base
        class constructor.
    :param soup: BeautifulSoup object to populate. When it is falsy a
        fresh, empty one is created with the "html.parser" builder.
    """
    if not soup:
        # Nothing usable was supplied: fall back to an empty document.
        from contrib.bs4 import BeautifulSoup
        soup = BeautifulSoup("", "html.parser")
    self.soup = soup
    super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def Login( self, login, password ):
    """Log in with the given credentials unless already logged in.

    The credentials are URL-encoded as the form fields "email" and
    "pass" and sent to _COSMIC_LOGIN_URL through self._opener.  The
    response is parsed as HTML; a <dd class="login-error"> element in
    it is treated as a failed login.

    :param login: Value sent as the "email" form field; must be non-empty.
    :param password: Value sent as the "pass" form field; must be
        non-empty.
    :raises RuntimeError: If login or password is empty, or if the
        response contains a login-error element.  In the latter case
        the message includes the login, the password masked with
        asterisks, and the text of any <h3> elements in the response.
    """
    # Don't login twice.
    if self._is_loggedin:
        return

    if not login:
        raise RuntimeError( self._errmsg_cannot_login + " " +
                            self._errmsg_empty_email )
    if not password:
        raise RuntimeError( self._errmsg_cannot_login + " " +
                            self._errmsg_empty_pass )

    login_data = urllib.urlencode( { "email" : login, "pass" : password } )
    response = self._opener.open( _COSMIC_LOGIN_URL, login_data )
    htmldata = BeautifulSoup( response.read(), features="html.parser" )

    login_info = htmldata.find( "dd", { "class": "login-error" } )
    if login_info is not None:
        # The password was validated as non-empty above, so it can always
        # be masked; never echo the real value into an error message.
        vispass = "*" * len( password )
        errormsg = self._errmsg_cannot_login + " " + \
                   ( self._errmsg_credentials % ( login, vispass ) )

        server_parts = []
        for msg in htmldata.find_all( "h3" ):
            text = msg.string
            if text is None:
                # .string is None when the heading has several children;
                # fall back to the concatenated text instead of crashing.
                text = msg.get_text()
            server_parts.append( text + "." )
        servermsg = "".join( server_parts )

        # Only mention the server message when there actually is one.
        if servermsg:
            errormsg += " " + ( self._errmsg_servermsg % servermsg )
        raise RuntimeError( errormsg )

    self._is_loggedin = True
def test_formatter_processes_script_tag_for_xml_documents(self):
    """Text inside <script> is entity-escaped when the document is XML.

    The document is parsed with the "lxml-xml" builder, the script's
    string is replaced with text containing angle brackets, and the
    encoded output is checked for the escaped form of that text.
    """
    doc = """ <script type="text/javascript"> </script> """
    soup = BeautifulSoup(doc, "lxml-xml")
    # lxml would have stripped this while parsing, but we can add
    # it later.
    soup.script.string = 'console.log("< < hey > > ");'
    encoded = soup.encode()
    # <script> gets no special treatment in an XML document, so the
    # output formatter must turn the angle brackets into entities.
    self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def FindGeneID( self, name ):
    """Look up the numeric id of the gene called `name`.

    Fetches _COSMIC_SEARCH_URL % name through self._opener, parses the
    response as HTML and pairs up the hidden <input name="id"> and
    <input name="ln"> elements in document order.  The id whose paired
    "ln" value equals `name` is returned.

    :param name: Gene name to search for; must not be None.
    :return: The matching id converted to int.
    :raises RuntimeError: If not logged in, if `name` is None, if the
        page contains no hidden "id" inputs, or if no usable id can be
        extracted for `name`.
    """
    if not self._is_loggedin:
        raise RuntimeError( self._errmsg_no_login )
    if name is None:
        raise RuntimeError( self._errmsg_empty_gene )

    data = self._opener.open( _COSMIC_SEARCH_URL % name )
    htmldata = BeautifulSoup( data.read(), features="html.parser" )

    ids = htmldata.find_all( "input", { "name": "id", "type": "hidden" } )
    lns = htmldata.find_all( "input", { "name": "ln", "type": "hidden" } )

    if not ids:
        raise RuntimeError( self._errmsg_invalid_gene )

    # zip() stops at the shorter list, so a page with fewer "ln" than "id"
    # inputs ends in the parse error below instead of an IndexError.
    for id_input, ln_input in zip( ids, lns ):
        if ln_input.get( "value" ) != name:
            continue
        try:
            return int( id_input.get( "value" ) )
        except ( TypeError, ValueError ):
            # Missing or non-numeric value attribute: report it the same
            # way as any other unparseable result page.
            raise RuntimeError( self._errmsg_parse_error )

    raise RuntimeError( self._errmsg_parse_error )
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Builds one document with rdoc(num_elements), then parses it once
    with each parser specification in turn and prints how long each
    parse took.  A parser that raises is reported together with its
    traceback and the benchmark moves on to the next one.

    :param num_elements: Passed to rdoc() to size the generated document.
    """
    # Single-argument print(...) and "except Exception:" behave the same
    # on Python 2 and Python 3, unlike the print statement and the
    # "except Exception, e" form.
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            BeautifulSoup(data, parser)
            elapsed = time.time() - started
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))
def fragmentClass(self):
    """Start a new, empty soup for a document fragment and wrap it.

    `self.soup` is replaced by a fresh BeautifulSoup object (built with
    "html.parser") whose name is "[document_fragment]"; the returned
    Element wraps that object.
    """
    from contrib.bs4 import BeautifulSoup
    fragment_soup = BeautifulSoup("", "html.parser")
    self.soup = fragment_soup
    fragment_soup.name = "[document_fragment]"
    return Element(fragment_soup, fragment_soup, None)
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """TreeBuilder whose nodes wrap objects belonging to a BeautifulSoup.

    The BeautifulSoup object being populated is kept in `self.soup`.
    """

    def __init__(self, namespaceHTMLElements, soup=None):
        """Store the soup to build into, then initialise the base class.

        :param namespaceHTMLElements: Passed unchanged to the base class
            constructor.
        :param soup: BeautifulSoup object to use. When falsy, a new empty
            one is created with the "html.parser" builder.
        """
        if soup:
            self.soup = soup
        else:
            # NOTE(review): imported locally, presumably to avoid a
            # circular import at module load time -- confirm.
            from contrib.bs4 import BeautifulSoup
            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset `self.soup` and return an Element wrapping it."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from `token` and hand it to the soup.

        :param token: Mapping read for the keys "name", "publicId" and
            "systemId".
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag in the soup and return an Element wrapping it."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a TextNode wrapping a Comment built from `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace `self.soup` with an empty fragment soup and wrap it.

        The new BeautifulSoup object is given the name
        "[document_fragment]".
        """
        from contrib.bs4 import BeautifulSoup
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` directly to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the BeautifulSoup object being built."""
        return self.soup

    def getFragment(self):
        """Return the `element` of the fragment produced by the base class."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Serialize `element` and its descendants to an indented dump.

        Every output line starts with "|" followed by the current
        indentation; children are indented two spaces deeper than their
        parent and attributes two spaces deeper than their tag.

        :param element: Root object to serialize.
        :return: The output lines joined with newlines.
        """
        from contrib.bs4 import BeautifulSoup
        rv = []
        # Splits a doctype string into its name and the optional
        # PUBLIC / SYSTEM identifiers.
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            # Appends the lines for `element` to rv, recursing into the
            # children of tag-like objects.
            if isinstance(element, BeautifulSoup):
                # No special handling: the soup object falls through to
                # the generic tag branch at the end of the chain below.
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    # lastindex > 1 means one of the identifier groups
                    # matched, i.e. a PUBLIC or SYSTEM id is present.
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Tag-like object: emit the (optionally prefixed) name,
                # then its attributes, then its children.
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in element.attrs.items():
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace], name.name)
                        # List-valued attributes are written as one
                        # space-separated string.
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    # Attributes are emitted in sorted order.
                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)
try: if os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data with open(data) as fp: data = fp.read() except ValueError: # This can happen on some platforms when the 'filename' is # too long. Assume it's data and not a filename. pass print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: soup = BeautifulSoup(data, features=parser) success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "Here's what %s did with the markup:" % parser print soup.prettify() print "-" * 80 def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. This lets you see how lxml parses a document when no Beautiful Soup code is running.
def soup(self, markup, **kwargs):
    """Parse `markup` into a BeautifulSoup object.

    A 'builder' keyword argument selects the tree builder; when it is
    absent `self.default_builder` is used.  Any other keyword arguments
    are forwarded to the BeautifulSoup constructor.
    """
    fallback_builder = self.default_builder
    chosen_builder = kwargs.pop('builder', fallback_builder)
    parsed = BeautifulSoup(markup, builder=chosen_builder, **kwargs)
    return parsed