def parse_link(self, base_url, html):
    """Parse a fetched page, run the pipeline on it, and queue its links.

    The markup in ``html`` is parsed with ``BeautifulSoup`` and the resulting
    tree is handed to ``self.pipe.process`` together with this object and
    ``base_url``. Afterwards every element selected by ``self.rule`` that
    carries an ``href`` attribute is resolved against ``base_url.url``; each
    resolved URL accepted by ``urlrule.match`` is added to ``self.queue`` as
    a ``Link`` one level deeper than ``base_url``.

    Args:
        base_url: Object exposing ``url`` and ``depth`` for the page that
            ``html`` was fetched from.
        html: Markup of the fetched page.
    """
    document = BeautifulSoup(html)
    self.pipe.process(self, base_url, document)

    # Every link discovered on this page sits one level below the page itself.
    child_depth = base_url.depth + 1

    for anchor in soup_links(document, self.rule) if False else document.findAll(self.rule, href=True):
        absolute = urlrule.get_abs_url(base_url.url, anchor["href"])
        if not urlrule.match(absolute):
            continue
        self.queue.add_link(Link(absolute, child_depth))
def test_get_abs_url_can_return_abs_url(self):
    """Check that a root-relative path is resolved against the base URL."""
    site_root = "http://www.google.com"
    relative_path = "/test.html"
    expected = "http://www.google.com/test.html"

    resolved = urlrule.get_abs_url(site_root, relative_path)

    self.assertEqual(resolved, expected)