def dojob(self, t_name, function, job):
    """Run *function* on *job*, then remove the finished entry from the
    in-progress queue.

    Parameters:
        t_name: identifier of the worker thread (used only in log output).
        function: callable invoked as ``function(job)``.
        job: the work item handed to *function*.
    """
    myout.log('线程:' + str(t_name) + ' 处理:' + str(job))
    function(job)
    # Use the lock as a context manager so it is always released, even if
    # popleft() raises (e.g. on an unexpectedly empty deque) — the original
    # acquire()/release() pair would leave the lock held on an exception.
    with self.lock:
        self.do_jobing.popleft()
    myout.log('线程:' + str(t_name) + ' 处理完毕:' + str(job))
def main(url):
    """Crawler entry point: record the site's domain, seed the pending-URL
    queue with *url*, and start the processing thread pool.

    NOTE (translated from the original Chinese comment): a global must be
    declared with ``global`` before assignment, otherwise Python creates a
    new local variable instead.
    """
    global web_domain
    web_domain = tools.get_domain(url)
    myout.log(web_domain)
    wait_url.append(url)
    pool = threadPoolManager.PoolManager(process, wait_url)
def __init__(self, function, job):
    """Start the thread pool: spawn 100 worker threads, each running
    ``self.handle_job(worker_id, function, job)``.

    Parameters:
        function: callable each worker applies to the jobs it pulls.
        job: shared job source handed unchanged to every worker.
    """
    # The stdlib `queue` module has no `deque` attribute, so the original
    # `queue.deque()` would raise AttributeError at runtime unless `queue`
    # was a project-local alias — TODO confirm against the file's imports.
    # collections.deque is the correct double-ended queue type.
    from collections import deque
    myout.log('启动线程池')
    self.lock = threading.RLock()
    self.do_jobing = deque()  # jobs currently being processed
    for worker in range(100):
        worker_man = threading.Thread(
            target=self.handle_job, args=(worker, function, job))
        worker_man.start()
def process(now_url):
    """Fetch *now_url*, extract the in-domain links from its page, and
    enqueue every link that has not been seen before."""
    links = tools.get_links(now_url, web_domain)
    for link in links:
        # Skip anything already crawled; register and queue the rest.
        if link in get_url:
            continue
        myout.log("加入:" + link)
        get_url.add(link)
        wait_url.append(link)
def handle_job(self, t_name, function, job):
    """Worker loop: repeatedly poll ``self.get_job`` and act on the
    returned status until told to stop.

    Status codes observed in this loop:
        0 -> exit the loop,
        1 -> process the returned job via ``self.dojob``,
        2 -> nothing available right now; sleep one second and retry.
    """
    while True:
        status_info = self.get_job(job)
        code = status_info['status']
        if code == 0:
            break
        if code == 1:
            self.dojob(t_name, function, status_info['job'])
        elif code == 2:
            time.sleep(1)
    myout.log('线程:' + str(t_name) + '退出')