def run(self): while self.alive: time.sleep(1) walk_url = TaskQueue.getInstance().get() if not walk_url: self.emptycount = self.emptycount - 1 if self.emptycount == 0 : return continue self.emptycount = self.initempty _level = walk_url[1] _url = walk_url[0] if _level > self.level: continue if UrlPool.getInstance().exist(_url): continue UrlPool.getInstance().put_url(_url) try: html = self.down_load_html(_url,coding="gb2312") except Exception,e: print e continue proxydata = self.proxy_parser(html) _level = _level + 1 if len(proxydata): ProxyData.getInstance().put(proxydata) _level = 0 link_list = self.get_link(html,_level,_url) for link in link_list: if not UrlPool.getInstance().exist(link[0]): TaskQueue.getInstance().put(link)
def run(self): while self.alive: time.sleep(1) walk_url = TaskQueue.getInstance().get() if not walk_url: self.emptycount = self.emptycount - 1 if self.emptycount == 0: return continue self.emptycount = self.initempty _level = walk_url[1] _url = walk_url[0] if _level > self.level: continue if UrlPool.getInstance().exist(_url): continue UrlPool.getInstance().put_url(_url) try: html = self.down_load_html(_url, coding="gb2312") except Exception, e: print e continue proxydata = self.proxy_parser(html) _level = _level + 1 if len(proxydata): ProxyData.getInstance().put(proxydata) _level = 0 link_list = self.get_link(html, _level, _url) for link in link_list: if not UrlPool.getInstance().exist(link[0]): TaskQueue.getInstance().put(link)
def __init__(self, keyword=None):
    """Seed the crawler with search keywords.

    For every keyword (processed last-to-first, matching the original
    pop-based order) a Baidu search URL is built and, if not already
    known to the URL pool, queued at level -10.

    :param keyword: list of search terms; defaults to an empty list.
    :raises TypeError: if ``keyword`` is not a list.
    """
    # Fixes: keyword=[] was a mutable default argument; the original also
    # called self.keyword.extend(keyword) BEFORE validating the argument,
    # mutating state (or raising AttributeError) even on bad input.
    if keyword is None:
        keyword = []
    if not isinstance(keyword, list):
        raise TypeError("KEYWORD_MUST_BE_LIST")  # was misspelled "KEY_EORD..."
    # NOTE(review): the original round-tripped the terms through a
    # pre-existing self.keyword list (extend, then pop once per term),
    # which is equivalent to iterating the argument in reverse.
    for searchword in reversed(keyword):
        url = self.baidu_search(searchword)
        if not UrlPool.getInstance().exist(url):
            TaskQueue.getInstance().put((url, -10))