예제 #1
0
 def start_a(self, tag, attrs):
   """Handle an opening anchor tag and register its link with the crawler.

   The ``href`` value is passed through ``wash_url``, resolved against
   ``self.base_href`` with ``urljoin``, and offered to
   ``self.crawler.accept_url``. When that call returns a truthy value, the
   value is handed to ``self.crawler.inject_url``, stored in
   ``self.anchor_href``, and ``self.anchor_text`` is reset to an empty list.

   Args:
     tag: Tag name supplied by the parser; not used here.
     attrs: Iterable of ``(name, value)`` pairs for the tag's attributes.

   Returns:
     None. Nothing is touched when there is no ``href`` attribute or the
     crawler's ``accept_url`` returns a falsy value.
   """
   attr_map = dict(attrs)
   if 'href' not in attr_map:
     return

   # Clean the raw link, then make it absolute relative to the current base.
   resolved = urljoin(self.base_href, wash_url(attr_map['href']))
   accepted = self.crawler.accept_url(resolved)
   if not accepted:
     return

   self.crawler.inject_url(accepted)
   # Remember the link and start with an empty text buffer for this anchor.
   self.anchor_href = accepted
   self.anchor_text = []
예제 #2
0
 def start_base(self, tag, attrs):
   """Handle a base tag by updating the URL used to resolve links.

   When an ``href`` attribute is present, its value is passed through
   ``wash_url`` and joined onto the current ``self.base_href`` with
   ``urljoin``; the result replaces ``self.base_href``.

   Args:
     tag: Tag name supplied by the parser; not used here.
     attrs: Iterable of ``(name, value)`` pairs for the tag's attributes.

   Returns:
     None. ``self.base_href`` is left alone when there is no ``href``.
   """
   attr_map = dict(attrs)
   if 'href' not in attr_map:
     return

   cleaned = wash_url(attr_map['href'])
   # Joining against the existing base lets a relative href still resolve.
   self.base_href = urljoin(self.base_href, cleaned)