def extract(page):
    """Populate a ``Content`` record for *page* using the native extractor.

    A new ``Content`` is created for ``page.content_url`` with
    ``source=NATIVE``; ``page.raw_content`` is parsed with ``page_parser``
    and the parsed body and title are stored on that record.

    Args:
        page: object exposing ``content_url``, ``raw_content`` and
            ``base_href`` attributes.

    Raises:
        deferred.PermanentTaskFailure: if parsing, or extracting the body
            or title, raises a ``StandardError``. The message carries the
            original exception's type and text.
    """
    url = page.content_url
    content = Content(url=url, source=NATIVE)
    # Hand the argument to the logger rather than pre-formatting, so the
    # message is only rendered when the record is actually emitted.
    logging.info("fetching %r with native extractor", url)
    body = page.raw_content
    try:
        soup = page_parser.parse(
            body, base_href=page.base_href, notify=logging.info)
        content.body = page_parser.get_body(soup)
        content.title = page_parser.get_title(soup)
    except StandardError as e:
        # Any parser failure is converted to a permanent failure --
        # presumably so the deferred task is not retried; confirm against
        # the deferred library's retry semantics.
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
    # NOTE(review): nothing visible here returns or stores `content` --
    # confirm whether a return/put was intended after the try block.
def assertParses(self, input, expected):
    """Assert that parsing *input* yields a document rendering as *expected*.

    Args:
        input: markup handed to ``page_parser.parse``.
        expected: the text that ``unicode()`` of the parse result must equal.
    """
    # The second argument is a placeholder value; these checks only compare
    # the rendered parse output.
    output = page_parser.parse(input, "dontcare")
    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias of the same check.
    self.assertEqual(unicode(output), expected)