Пример #1
0
 def setUp(self):
     """Prepare the fixture: a logger plus an extractor fed a sample page.

     The extractor is created with an empty list of div ids and is then
     asked to process a small hand-written HTML document.
     """
     self.logger = logging.getLogger("TestStructuredExtractor")
     # The page is split into fragments only to keep source lines short;
     # joining them yields one contiguous HTML string.
     fragments = (
         ' <html> <div id="header"><h1>hello world</h1>',
         '</div><div id="content"><p>this is important</p>',
         '<p> study computing it is fun</p></div>',
         '<div id="footer"> <h2>byes</h2></div> ',
         '<div id="post"> stay <div id="sub-post">should be gone</div>',
         '</div><footer class="myfoot">at the bottom</footer></html> ',
     )
     page = "".join(fragments)
     self.extractor = PositionContentExtractor(div_ids=[])
     self.extractor.process_html_page(page)
Пример #2
0
 def test_extract_from_bad_page(self):
     """Run the extractor over ``self.html``, then install a new div-id list.

     No assertions are made here; the test fails only if one of the
     calls raises.
     """
     extractor = PositionContentExtractor(div_ids=self.div_ids)
     self.extractor = extractor
     extractor.process_html_page(self.html)
     # TODO: decide whether finishing without an error is enough to pass.
     extractor.set_div_ids(['related', 'skiplink-container'])