示例#1
0
class Handler(gevent.Greenlet):
    """Greenlet that processes a single URL object on behalf of a spider.

    When run, it opens the URL, iterates over the links produced by
    ``self.feed`` and registers every link that passes the spider's
    strategy filters in the spider's URL table and work queue.

    NOTE(review): ``open``, ``feed`` and ``stop`` are called on ``self`` but
    are not defined in this excerpt -- presumably they are provided
    elsewhere in the class or by a mixin; confirm.
    """

    logger = logging.getLogger("spider.handler")

    def __init__(self, urlobj, spider):
        """Remember the URL object to process and the owning spider.

        Args:
            urlobj: object exposing at least ``url`` and ``depth``.
            spider: object exposing ``strategy``, ``urltable``, ``queue``
                and ``stop()``.
        """
        gevent.Greenlet.__init__(self)
        self.urlobj = urlobj
        self.spider = spider
        # Not read anywhere in this excerpt; presumably the default encoding
        # used when decoding fetched pages -- TODO confirm.
        self.charset = "utf-8"

    def _run(self):
        """Fetch the page and enqueue every acceptable link found in it.

        Returns whatever ``self.stop()`` returns on the early-exit paths
        (open failure, depth limit exceeded) and ``None`` otherwise.
        """
        strategy = self.spider.strategy
        urltable = self.spider.urltable
        queue = self.spider.queue

        try:
            html = self.open(self.urlobj.url)
        except Exception as why:
            # Best-effort crawl: a page that cannot be opened is logged and
            # skipped instead of propagating the error out of the greenlet.
            self.logger.debug("open '%s' failed,since : %s", self.urlobj, why)
            return self.stop()

        linkin = self.urlobj
        # Links found on this page are one level deeper than the page itself.
        depth = linkin.depth + 1

        # A falsy max_depth disables the depth limit.
        # NOTE(review): this check runs after the page has already been
        # opened; if open() has no purpose besides feeding links, checking
        # first would save a request -- confirm before reordering.
        if strategy.max_depth and (depth > strategy.max_depth):
            return self.stop()

        for link in self.feed(html):

            # Once the table is full, stop this handler and the whole spider.
            if urltable.full():
                self.stop()
                self.spider.stop()
                return

            # Skip links that are already recorded.
            if link in urltable:
                continue

            if strategy.same_host and (not UrlFilter.isSameHost(
                    link, linkin.url)):
                continue

            if strategy.same_domain and (not UrlFilter.isSameDomain(
                    link, linkin.url)):
                continue

            # Record the new URL (with the current page as its referrer) and
            # hand it to the queue for later processing.
            url = UrlObj(link, depth, linkin)
            urltable.insert(url)
            queue.put(url)

            self.logger.debug("sucess crawled '%s' the <%d> urls", url,
                              len(urltable))

        self.stop()
示例#2
0
 def testSpiderStrategy(self):
     """Check the crawl result against the expected strategy limits.

     Asserts that the URL table holds exactly 5000 entries, that the last
     recorded URL has depth at most 3, and that the sampled entries
     (indices 100-199) are in the same domain as ``self.root``.
     """
     self.assertEqual(len(self.spider.urltable), 5000)
     self.assertLessEqual(self.spider.urltable.urls[-1].depth, 3)
     for url in self.spider.urltable.urls[100:200]:
         # assertTrue replaces the deprecated assert_ alias, which was
         # removed from unittest in Python 3.12.
         self.assertTrue(UrlFilter.isSameDomain(self.root, str(url)))
示例#3
0
 def testSameDomain(self):
     """Check that sampled crawled URLs share the domain of ``self.root``.

     Only entries at indices 100-199 of the URL table are examined; if the
     table holds fewer than 101 entries the loop body never runs.
     """
     for url in self.spider.urltable.urls[100:200]:
         # assertTrue replaces the deprecated assert_ alias, which was
         # removed from unittest in Python 3.12.
         self.assertTrue(UrlFilter.isSameDomain(self.root, str(url)))
示例#4
0
 def testSameDomain(self):
     """Verify that a sample of crawled URLs stays within the root's domain.

     The sample is the slice ``urls[100:200]`` of the spider's URL table, so
     a table with 100 or fewer entries makes this test pass vacuously.
     """
     for url in self.spider.urltable.urls[100:200]:
         # Use assertTrue: the assert_ alias is deprecated and no longer
         # exists in unittest from Python 3.12 onwards.
         self.assertTrue(UrlFilter.isSameDomain(self.root, str(url)))