예제 #1
0
 def run(self):
     """Crawl loop: take tasks from the queue, harvest proxies, enqueue links.

     Each task is indexed as ``task[0]`` (URL) and ``task[1]`` (level).
     The loop runs while ``self.alive`` is truthy and sleeps one second
     per iteration.  Every empty poll decrements ``self.emptycount``; a
     successful poll resets it to ``self.initempty``.  The method returns
     once the counter has dropped to zero or below.

     Download errors are printed and the task is skipped; they never
     propagate out of this method.
     """
     while self.alive:
         time.sleep(1)
         walk_url = TaskQueue.getInstance().get()
         if not walk_url:
             self.emptycount -= 1
             # "<= 0" rather than "== 0": a zero or negative starting
             # budget would otherwise step past zero and never stop.
             if self.emptycount <= 0:
                 return
             continue
         self.emptycount = self.initempty

         _url = walk_url[0]
         _level = walk_url[1]
         if _level > self.level:
             continue
         if UrlPool.getInstance().exist(_url):
             continue
         # put_url happens before the download, so URLs whose download
         # fails are recorded in the pool as well.
         UrlPool.getInstance().put_url(_url)
         try:
             html = self.down_load_html(_url, coding="gb2312")
         except Exception as e:
             # Best effort: report the failure and move on to the next task.
             print(e)
             continue

         proxydata = self.proxy_parser(html)
         _level = _level + 1
         # Truthiness test instead of len(): also tolerates a None result.
         if proxydata:
             ProxyData.getInstance().put(proxydata)
             # A page that yielded proxy data passes level 0 to its links.
             _level = 0
         link_list = self.get_link(html, _level, _url)
         for link in link_list:
             if not UrlPool.getInstance().exist(link[0]):
                 TaskQueue.getInstance().put(link)
예제 #2
0
 def run(self):
     """Crawl loop: take tasks from the queue, harvest proxies, enqueue links.

     Each task is indexed as ``task[0]`` (URL) and ``task[1]`` (level).
     The loop runs while ``self.alive`` is truthy and sleeps one second
     per iteration.  Every empty poll decrements ``self.emptycount``; a
     successful poll resets it to ``self.initempty``.  The method returns
     once the counter has dropped to zero or below.

     Download errors are printed and the task is skipped; they never
     propagate out of this method.
     """
     while self.alive:
         time.sleep(1)
         walk_url = TaskQueue.getInstance().get()
         if not walk_url:
             self.emptycount -= 1
             # "<= 0" rather than "== 0": a zero or negative starting
             # budget would otherwise step past zero and never stop.
             if self.emptycount <= 0:
                 return
             continue
         self.emptycount = self.initempty

         _url = walk_url[0]
         _level = walk_url[1]
         if _level > self.level:
             continue
         if UrlPool.getInstance().exist(_url):
             continue
         # put_url happens before the download, so URLs whose download
         # fails are recorded in the pool as well.
         UrlPool.getInstance().put_url(_url)
         try:
             html = self.down_load_html(_url, coding="gb2312")
         except Exception as e:
             # Best effort: report the failure and move on to the next task.
             print(e)
             continue

         proxydata = self.proxy_parser(html)
         _level = _level + 1
         # Truthiness test instead of len(): also tolerates a None result.
         if proxydata:
             ProxyData.getInstance().put(proxydata)
             # A page that yielded proxy data passes level 0 to its links.
             _level = 0
         link_list = self.get_link(html, _level, _url)
         for link in link_list:
             if not UrlPool.getInstance().exist(link[0]):
                 TaskQueue.getInstance().put(link)
예제 #3
0
 def __init__(self, keyword=None):
     """Queue one search URL for every keyword supplied.

     Args:
         keyword: list of search words; omitted or ``None`` means no
             keywords.

     Raises:
         TypeError: if ``keyword`` is given and is not a ``list``.

     NOTE(review): ``self.keyword`` is read here without being assigned,
     so it must already exist when this runs -- presumably a class-level
     list, which would be shared by all instances; confirm.
     """
     if keyword is None:
         keyword = []
     # Validate before mutating: previously a non-list iterable (e.g. a
     # str) was spliced into self.keyword and only then rejected.
     # The message text is kept verbatim, spelling included, in case
     # callers match on it.
     if not isinstance(keyword, list):
         raise TypeError("KEY_EORD_MUST_BE_LIST")
     self.keyword.extend(keyword)
     # One pop per keyword just appended: the new entries are consumed
     # from the end of self.keyword, i.e. last keyword first.
     for _kw in keyword:
         searchword = self.keyword.pop()
         url = self.baidu_search(searchword)
         if not UrlPool.getInstance().exist(url):
             # Second tuple element is -10; presumably the starting level
             # read by the crawl loop -- TODO confirm.
             TaskQueue.getInstance().put((url, -10))