def feed(self, raw_data):
    """
    Pre-process ``raw_data`` and pass the result to ``HTMLParser.feed``.

    The input must be a ``str``. Surrounding whitespace is stripped, every
    match of ``block_re`` and then of ``inline_re`` is replaced through
    ``self._pre_cut_out`` (this cuts out the <pre> and <tt> areas), and the
    result is run through ``strip_html`` before it is parsed.

    When ``self.debugging`` is true, the raw input and the cleaned text are
    printed to stdout.

    Returns ``self.root``.
    """
    assert isinstance(raw_data, str), "feed data must be unicode!"

    # Cut out the <pre> and <tt> areas: block_re matches first, then inline_re.
    without_blocks = block_re.sub(self._pre_cut_out, raw_data.strip())
    without_inlines = inline_re.sub(self._pre_cut_out, without_blocks)

    # Delete whitespace from the remaining html code.
    cleaned = strip_html(without_inlines)

    if self.debugging:
        debug_lines = (
            "_" * 79,
            "raw data:",
            repr(raw_data),
            " -" * 40,
            "cleaned data:",
            cleaned,
            "-" * 79,
        )
        for line in debug_lines:
            print(line)

    HTMLParser.feed(self, cleaned)
    return self.root
def feed(self, raw_data):
    """
    Clean up ``raw_data`` and feed it to the underlying ``HTMLParser``.

    Works on both Python 2 and Python 3: the input must be text
    (``unicode`` on Python 2, ``str`` on Python 3).

    Steps:
      1. strip leading/trailing whitespace,
      2. substitute every ``block_re`` and then every ``inline_re`` match
         through ``self._pre_cut_out`` (cuts out the <pre> and <tt> areas),
      3. run the result through ``strip_html``,
      4. hand the cleaned text to ``HTMLParser.feed``.

    When ``self.debugging`` is true, the raw and the cleaned data are
    printed to stdout.

    Returns ``self.root``.
    Raises ``AssertionError`` if ``raw_data`` is not text.
    """
    # Python 2 names the text type ``unicode``; on Python 3 that name does
    # not exist and ``str`` is the text type.
    try:
        text_type = unicode  # noqa: F821 - only defined on Python 2
    except NameError:
        text_type = str

    assert isinstance(raw_data, text_type), "feed data must be unicode!"

    data = raw_data.strip()

    # cut out <pre> and <tt> areas block tag areas
    data = block_re.sub(self._pre_cut_out, data)
    data = inline_re.sub(self._pre_cut_out, data)

    # Delete whitespace from html code
    data = strip_html(data)

    if self.debugging:
        # Single-argument print(...) produces the same output whether print
        # is the Python 2 statement or the Python 3 function.
        print("_" * 79)
        print("raw data:")
        print(repr(raw_data))
        print(" -" * 40)
        print("cleaned data:")
        print(data)
        print("-" * 79)

    HTMLParser.feed(self, data)
    return self.root
def test_remove_linebreak(self):
    """The newline between ``</strong>`` and ``<ul>`` is removed by strip_html."""
    source = "<strong>foo</strong>\n<ul><li>one</li></ul>"
    expected = "<strong>foo</strong><ul><li>one</li></ul>"
    self.assertEqual(strip_html(source), expected)
def test_not_closed_image_tag(self):
    """A paragraph containing an unclosed <img> tag comes back unchanged."""
    # Input and expected output are the same string.
    markup = '<p>a <img src="/image.jpg"> image.</p>'
    self.assertEqual(strip_html(markup), markup)
def test_remove_linebreak(self):
    """strip_html drops the line break that separates two adjacent tags."""
    html_with_linebreak = '<strong>foo</strong>\n<ul><li>one</li></ul>'
    stripped = strip_html(html_with_linebreak)
    self.assertEqual(
        stripped,
        '<strong>foo</strong><ul><li>one</li></ul>',
    )