Пример #1
0
def extract(page):
    url = page.content_url
    content = Content(url=url, source=NATIVE)
    logging.info("fetching %r with native extractor" % (url,))
    body = page.raw_content
    try:
        soup = page_parser.parse(body, base_href=page.base_href, notify=logging.info)
        content.body = page_parser.get_body(soup)
        content.title = page_parser.get_title(soup)
    except StandardError, e:
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
Пример #2
0
 def assertParses(self, input, expected):
     output = page_parser.parse(input, "dontcare")
     self.assertEquals(unicode(output), expected)