def html_parsed(self):
    """
    Parse the rendered infobox HTML into a list of (key, value) pairs.

    Every ``<tr>`` found in the document built from ``self.html_source()``
    that has at least two child elements contributes one pair, taken from
    its first two children; rows with fewer children are skipped.

    The key is flattened to text and runs of whitespace are collapsed to
    a single space. The value is flattened to text as well, except that
    ``<br>``, ``<ul>`` and ``<li>`` tags are kept as literal markup in
    the returned string.

    :returns: a list of ``(key, value)`` tuples, in the order the rows
        are yielded by ``findall``.
    """

    def escape_lists(val):
        # Entity-encode the angle brackets of br/ul/li tags so that the
        # parser treats them as plain text instead of elements; any
        # whitespace just inside the brackets is dropped on the way.
        if not val:
            return u""

        return re.sub(
            r"<\s*(/?\s*(br\s*/?|/?ul|/?li))\s*>", "&lt;\\1&gt;", val)

    def unescape_lists(val):
        # Restore the protected tags. Both literal and entity-encoded
        # brackets are accepted because it depends on totext() whether
        # the entities come back decoded or not.
        if not val:
            return u""

        return re.sub(
            r"(?:<|&lt;)(/?\s*(br\s*/?|ul|li))(?:>|&gt;)", "<\\1>", val)

    soup = fromstring(self.html_source())

    # Render all tags except <ul>, <li> and <br>: those are escaped
    # before the value is reparsed and unescaped once it is plain text.
    tpairs = []
    for row in soup.findall('.//tr'):
        cells = row.findall('./*')
        if len(cells) < 2:
            # Not a key/value row (e.g. a header spanning the table).
            continue

        e_key, e_val = cells[0], cells[1]

        # Serialize the key cell and reparse it with the second
        # argument of fromstring() set (per the original comment, this
        # makes <br> into newlines), then take its text and normalize
        # whitespace.
        key = totext(fromstring(tostring(e_key), True))
        key = re.sub(r"\s+", " ", key).strip()

        # Protect the list/break tags, flatten everything else to text,
        # then bring the protected tags back.
        val = escape_lists(tostring(e_val))
        val = totext(fromstring(val))
        val = unescape_lists(val.strip())

        tpairs.append((key, val))

    return tpairs
def _html_infoboxes(self, html):
    """
    Return the rendered infobox-like tables found in ``html``.

    A table is selected when the string "infobox" occurs anywhere in its
    ``class`` attribute. Non-infobox tables such as sidebars may satisfy
    this test too; until Wikipedia uses a CSS class specifically for
    infoboxes there is no better way of selecting them.

    :param html: the rendered HTML to search.
    :returns: a list of the matching ``<table>`` elements.
    """
    tree = fromstring(html)

    infoboxes = []
    for table in tree.findall(".//table"):
        css_class = table.get('class', '')
        if 'infobox' in css_class:
            infoboxes.append(table)

    return infoboxes
def test_fromstringtotext(self):
    """Round-trip small fragments through fromstring() and totext()."""
    # Without the second argument a trailing <br/> leaves no trace.
    plain = util.totext(util.fromstring("hello<br/>"))
    self.assertEqual(plain, "hello")

    # With the second argument set, a <br/> comes out as a newline.
    with_breaks = util.totext(util.fromstring("<br/>", True))
    self.assertEqual(with_breaks, "\n")
def test_html(self):
    """Check totext() and tostring() on a small parsed HTML document."""
    html = "<html> <body><p>yes</p> <p> hi</p> <img/> </body> </html>"
    el = util.fromstring(html)

    # Text extraction keeps the paragraph text only.
    text = util.totext(el).strip()
    self.assertEqual("yes hi", text)

    # Serialization keeps the markup.
    markup = util.tostring(el)
    self.assertIn("<p>", markup)
def _soup(self):
    """
    Return the parsed form of ``self.html_source()``, parsing it once.

    The result of ``fromstring`` is stored on the instance on the first
    call and returned as-is on later calls.
    """
    # Inside a class body ``self.__soup`` is name-mangled to
    # ``self._<ClassName>__soup``, so a ``hasattr(self, '__soup')`` check
    # on the literal name never finds it and the cache is never hit.
    # Reading the attribute directly goes through the same mangling as
    # the assignment below.
    try:
        return self.__soup
    except AttributeError:
        self.__soup = fromstring(self.html_source())
        return self.__soup