Пример #1
0
 def init_request(self):
     """Seed the pending queue from URLs already recorded for the task.

     Every record returned by ``self.task.get_exist_url()`` is counted in
     ``self.urlcount`` and turned into a ``Request``.  A request whose
     visit counter is still below ``self.duplicates`` has that counter
     incremented; it is additionally queued in ``self.pendings`` when its
     URL is not discarded and the record has no ``end_time`` set.

     Returns the updated ``self.urlcount``.
     """
     existing = self.task.get_exist_url()
     self.urlcount += len(existing)
     for record in existing:
         req = Request(record.url, record.method, record.params, record.referer)
         if not self.visited[req] < self.duplicates:
             # Seen as often as allowed already: nothing to do.
             continue
         # Queue only records that are neither filtered out nor finished
         # (a truthy end_time presumably marks a completed crawl -- confirm).
         if not (discard(req.url) or record.end_time):
             req.id = record.id
             self.pendings.put(req)
             DEBUG("-----request:%s not crawler,add queue" % req)
         self.visited[req] += 1
     return self.urlcount
Пример #2
0
 def init_request(self):
     """Seed the pending queue from URLs already recorded for the task.

     Every record returned by ``self.task.get_exist_url()`` is counted in
     ``self.urlcount`` and turned into a ``Request``.  A request whose
     visit counter is still below ``self.duplicates`` has that counter
     incremented; it is additionally queued in ``self.pendings`` when its
     URL is not discarded and the record has no ``end_time`` set.

     Returns the updated ``self.urlcount``.
     """
     existing = self.task.get_exist_url()
     self.urlcount += len(existing)
     for record in existing:
         req = Request(record.url, record.method, record.params, record.referer)
         if not self.visited[req] < self.duplicates:
             # Seen as often as allowed already: nothing to do.
             continue
         # Queue only records that are neither filtered out nor finished
         # (a truthy end_time presumably marks a completed crawl -- confirm).
         if not (discard(req.url) or record.end_time):
             req.id = record.id
             self.pendings.put(req)
             DEBUG("-----request:%s not crawler,add queue" % req)
         self.visited[req] += 1
     return self.urlcount
Пример #3
0
 def addRequest(self, request):
     """Register a newly discovered request and queue it when eligible.

     Steps performed, in order:

     1. Skip the request entirely if its visit counter has reached
        ``self.duplicates``.
     2. Call ``self.judgeUrlCount()`` (presumably the max-URL-count
        check -- confirm against its definition).
     3. Pass the request through ``pipeline``.  For a URL that is not
        discarded, the value ``pipeline`` returns is stored as
        ``request.id`` and the request is put on ``self.pendings``.
     4. Increment the visit counter for the request.
     """
     if not self.visited[request] < self.duplicates:
         # Duplicate beyond the allowed count: ignore it.
         return
     self.judgeUrlCount()
     if discard(request.url):
         # URLs such as .png still go through the pipeline but are not queued.
         pipeline(request)
     else:
         request.id = pipeline(request)
         self.pendings.put(request)
     self.visited[request] += 1
Пример #4
0
 def addRequest(self, request):
     """Register a newly discovered request and queue it when eligible.

     Steps performed, in order:

     1. Skip the request entirely if its visit counter has reached
        ``self.duplicates``.
     2. Call ``self.judgeUrlCount()`` (presumably the max-URL-count
        check -- confirm against its definition).
     3. Pass the request through ``pipeline``.  For a URL that is not
        discarded, the value ``pipeline`` returns is stored as
        ``request.id`` and the request is put on ``self.pendings``.
     4. Increment the visit counter for the request.
     """
     if not self.visited[request] < self.duplicates:
         # Duplicate beyond the allowed count: ignore it.
         return
     self.judgeUrlCount()
     if discard(request.url):
         # URLs such as .png still go through the pipeline but are not queued.
         pipeline(request)
     else:
         request.id = pipeline(request)
         self.pendings.put(request)
     self.visited[request] += 1