def test_site_pages(self):
    """Check parsed annotations against the expected ones for real pages.

    Real-page samples are more reliable and easier to build than
    hand-written markup once the structures get complicated.

    For every ``(source, annotations)`` sample yielded by
    ``iter_samples('pageparsing')`` the source is fed to a
    ``TemplatePageParser`` and each resulting annotation is compared,
    slot by slot, with the expected entry at the same position.
    """
    for source, annotations in iter_samples('pageparsing'):
        template = HtmlPage(body=source)
        parser = TemplatePageParser(TokenDict())
        parser.feed(template)
        for annotation in parser.annotations:
            # Fail with a readable message rather than an IndexError when
            # the parser yields more annotations than the sample lists.
            self.assertTrue(
                annotations,
                "parser produced more annotations than expected")
            test_annotation = annotations.pop(0)
            for s in annotation.__slots__:
                actual = getattr(annotation, s)
                if s == "tag_attributes":
                    # Compare the whole sequence of pairs at once so that
                    # missing *and* extra pairs are both reported; each
                    # pair is converted to a list before comparison, as
                    # the expected pairs are compared against lists.
                    actual = [list(pair) for pair in actual]
                self.assertEqual(actual, test_annotation[s])
        # Every expected annotation must have been matched by the parser.
        self.assertEqual(annotations, [])
def test_extraction(self):
    """Train on one sample page, scrape another, then repeat after reload.

    The 'scraper_loadstore' samples must contain exactly two
    ``(html, data)`` pairs: the first one trains the scraper, the second
    one supplies the page to scrape and the data expected from it.
    """
    encoding = 'latin1'
    samples = list(iter_samples('scraper_loadstore', html_encoding=encoding))
    (train_html, train_data), (target_html, expected) = samples

    scraper = Scraper()
    scraper.train_from_htmlpage(
        HtmlPage(body=train_html, encoding=encoding), train_data)

    target_page = HtmlPage(body=target_html, encoding=encoding)
    self._assert_extracted(scraper.scrape_page(target_page), expected)

    # check it still works after a serialize/deserialize round trip
    buf = StringIO()
    scraper.tofile(buf)
    buf.seek(0)
    restored = Scraper.fromfile(buf)
    self._assert_extracted(restored.scrape_page(target_page), expected)
def test_site_samples(self):
    """Test parse_html against samples taken from real cases.

    Each 'htmlpage' sample is a ``(source, parsed)`` pair and is handed
    to ``_test_sample`` together with its position in the sample stream.
    """
    samples = iter_samples('htmlpage', object_hook=_decode_element)
    for index, sample in enumerate(samples):
        html, expected = sample
        self._test_sample(html, expected, index)