예제 #1
0
파일: page.py 프로젝트: pombredanne/krauler
    def parse(self):
        """Collect the URLs referenced by this page and queue them for crawling.

        For every ``a``/``link`` element the ``href`` attribute is read, and
        for every ``img``/``iframe`` element the ``src`` attribute. Each value
        is resolved against ``self.url`` and passed through ``normalize_url``;
        results that come back as ``None`` are dropped. The de-duplicated set
        is announced through ``on_parse.send`` and every URL in it is then
        handed to ``self.state.crawl`` with ``path=self.next_path``.
        """
        # (element name, attribute that carries the link) pairs to inspect.
        link_sources = (
            ('a', 'href'),
            ('img', 'src'),
            ('link', 'href'),
            ('iframe', 'src'),
        )

        # TODO: check rel="canonical"
        found = set()
        for element_name, attribute in link_sources:
            search_path = './/%s' % element_name
            for element in self.doc.findall(search_path):
                raw_value = element.get(attribute)
                if raw_value is not None:
                    candidate = normalize_url(urljoin(self.url, raw_value))
                    if candidate is not None:
                        found.add(candidate)

        # Listeners receive the same set object that is iterated below.
        on_parse.send(self, urls=found)

        for target in found:
            self.state.crawl(target, path=self.next_path)
예제 #2
0
파일: page.py 프로젝트: pombredanne/krauler
 def url(self):
     """Return the URL for this page.

     When ``self._has_response()`` is true, the response's ``url`` is passed
     through ``normalize_url`` and that result is returned; otherwise
     ``self.raw_url`` is returned unchanged.
     """
     # Read the raw URL up front so attribute access order is unchanged.
     requested = self.raw_url
     if not self._has_response():
         return requested
     return normalize_url(self._response.url)
예제 #3
0
 def seeds(self):
     """Return the list of normalized seed URLs, computing it on first use.

     The entries of ``self.get_list('seed')`` are passed through
     ``normalize_url``; entries for which it returns ``None`` are discarded.
     The result is stored on ``self._seeds`` and reused on later calls.
     """
     if hasattr(self, '_seeds'):
         return self._seeds

     usable = []
     for raw_seed in self.get_list('seed'):
         normalized = normalize_url(raw_seed)
         if normalized is not None:
             usable.append(normalized)
     self._seeds = usable
     return self._seeds