import logging

import gevent

class Handler(gevent.Greenlet):

    logger = logging.getLogger("spider.handler")

    def __init__(self, urlobj, spider):
        gevent.Greenlet.__init__(self)
        self.urlobj = urlobj
        self.spider = spider
        self.charset = "utf-8"

    def _run(self):
        strategy = self.spider.strategy
        urltable = self.spider.urltable
        queue = self.spider.queue
        # Fetch the page; a failed download just stops this greenlet.
        try:
            html = self.open(self.urlobj.url)
        except Exception as why:
            self.logger.debug("open '%s' failed, since: %s", self.urlobj, why)
            return self.stop()
        linkin = self.urlobj
        depth = linkin.depth + 1
        # Respect the configured depth limit before extracting links.
        if strategy.max_depth and (depth > strategy.max_depth):
            return self.stop()
        for link in self.feed(html):
            # A full url table stops the whole spider, not just this handler.
            if urltable.full():
                self.stop()
                self.spider.stop()
                return
            if link in urltable:
                continue
            if strategy.same_host and (not UrlFilter.isSameHost(link, linkin.url)):
                continue
            if strategy.same_domain and (not UrlFilter.isSameDomain(link, linkin.url)):
                continue
            # New link: record it and queue it for another handler.
            url = UrlObj(link, depth, linkin)
            urltable.insert(url)
            queue.put(url)
            self.logger.debug("successfully crawled '%s', <%d> urls total", url, len(urltable))
        self.stop()
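Handler calls two helpers that are not shown in this snippet: open(), which downloads a page, and feed(), which extracts its links. Below is a minimal sketch of what they might look like, assuming a plain urllib2 fetch and a crude regex link extractor; the spider's actual implementations (and its UrlFilter/UrlObj helpers) live elsewhere.

import re
import urllib2

class Handler(gevent.Greenlet):
    # ... __init__ and _run as above ...

    def open(self, url):
        # Plain blocking fetch; gevent's monkey-patched sockets make it
        # cooperative, so many handlers can download concurrently.
        response = urllib2.urlopen(url, timeout=10)
        return response.read()

    def feed(self, html):
        # Yield every href in the page. A regex is enough for a sketch;
        # a real crawler would use a proper HTML parser such as lxml.
        for match in re.finditer(r'href=[\'"]?([^\'" >]+)', html):
            yield match.group(1)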
def testSpiderStrategy(self):
    # The url table should be capped at 5000 entries, none deeper than 3,
    # and sampled urls should stay on the root's domain.
    self.assertEqual(len(self.spider.urltable), 5000)
    self.assertLessEqual(self.spider.urltable.urls[-1].depth, 3)
    for url in self.spider.urltable.urls[100:200]:
        self.assertTrue(UrlFilter.isSameDomain(self.root, str(url)))
def testSameDomain(self):
    for url in self.spider.urltable.urls[100:200]:
        self.assertTrue(UrlFilter.isSameDomain(self.root, str(url)))
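Both tests presuppose a spider that has already finished a crawl with the url table capped at 5000 entries, a depth limit of 3, and a same-domain strategy. A hedged sketch of a setUp that would produce that state follows; the Strategy/Spider constructor arguments (max_url, max_depth, same_domain) are guesses read off the assertions above, not a confirmed API.

import unittest

class TestSpider(unittest.TestCase):

    def setUp(self):
        # Hypothetical configuration inferred from the assertions above.
        self.root = "http://www.example.com"
        strategy = Strategy(max_depth=3, max_url=5000, same_domain=True)
        self.spider = Spider(self.root, strategy=strategy)
        self.spider.run()  # crawl to completion before the tests run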