예제 #1
0
    def feed(self, raw_data):
        """
        Clean up *raw_data* and hand the result to ``HTMLParser.feed``.

        The input is stripped of surrounding whitespace, every match of
        ``block_re`` and then ``inline_re`` is replaced through
        ``self._pre_cut_out``, and the text is finally passed through
        ``strip_html``.  With ``self.debugging`` set, the raw and the
        cleaned text are printed before parsing.

        Asserts that *raw_data* is a ``str``; returns ``self.root``.
        """
        assert isinstance(raw_data, str), "feed data must be unicode!"

        # Cut out the block-level areas first, then the inline ones
        # (the <pre> and <tt> areas, going by the original comment).
        cleaned = raw_data.strip()
        for pattern in (block_re, inline_re):
            cleaned = pattern.sub(self._pre_cut_out, cleaned)

        # Delete whitespace from the html code.
        cleaned = strip_html(cleaned)

        if self.debugging:
            report = (
                "_" * 79,
                "raw data:",
                repr(raw_data),
                " -" * 40,
                "cleaned data:",
                cleaned,
                "-" * 79,
            )
            for line in report:
                print(line)

        HTMLParser.feed(self, cleaned)
        return self.root
예제 #2
0
파일: parser.py 프로젝트: jurev/magicbawl
    def feed(self, raw_data):
        """
        Pre-process *raw_data* and feed it to the underlying HTMLParser.

        Steps: strip surrounding whitespace, substitute all ``block_re``
        matches and then all ``inline_re`` matches via
        ``self._pre_cut_out``, and run the result through ``strip_html``.
        If ``self.debugging`` is true, the raw input and the cleaned text
        are printed first.

        Asserts that *raw_data* is a ``unicode`` object; returns
        ``self.root``.
        """
        assert isinstance(raw_data, unicode), "feed data must be unicode!"

        stripped = raw_data.strip()

        # Cut out <pre> and <tt> areas (block-level first, then inline).
        without_blocks = block_re.sub(self._pre_cut_out, stripped)
        without_inline = inline_re.sub(self._pre_cut_out, without_blocks)

        # Delete whitespace from the html code.
        cleaned = strip_html(without_inline)

        if self.debugging:
            # A single parenthesised argument prints the same text whether
            # ``print`` is the statement or the function.
            print("_" * 79)
            print("raw data:")
            print(repr(raw_data))
            print(" -" * 40)
            print("cleaned data:")
            print(cleaned)
            print("-" * 79)

        HTMLParser.feed(self, cleaned)

        return self.root
예제 #3
0
 def test_remove_linebreak(self):
     # The newline between </strong> and <ul> must be gone from the
     # strip_html() result; everything else stays as it was.
     markup = "<strong>foo</strong>\n<ul><li>one</li></ul>"
     expected = "<strong>foo</strong><ul><li>one</li></ul>"
     self.assertEqual(strip_html(markup), expected)
예제 #4
0
 def test_not_closed_image_tag(self):
     # Markup containing an <img> tag without a closing slash is expected
     # to come back from strip_html() unchanged.
     markup = '<p>a <img src="/image.jpg"> image.</p>'
     expected = '<p>a <img src="/image.jpg"> image.</p>'
     self.assertEqual(strip_html(markup), expected)
예제 #5
0
 def test_remove_linebreak(self):
     # strip_html() should drop the line break that separates the
     # <strong> element from the following <ul> list.
     html_in = '<strong>foo</strong>\n<ul><li>one</li></ul>'
     html_out = strip_html(html_in)
     self.assertEqual(
         html_out,
         '<strong>foo</strong><ul><li>one</li></ul>',
     )
예제 #6
0
 def test_not_closed_image_tag(self):
     # An unclosed <img> tag inside a paragraph: the expected result of
     # strip_html() is the very same markup.
     html_in = '<p>a <img src="/image.jpg"> image.</p>'
     html_out = strip_html(html_in)
     self.assertEqual(
         html_out,
         '<p>a <img src="/image.jpg"> image.</p>',
     )