예제 #1
0
 def dojob(self,t_name,function,job):
      """Run ``function(job)`` for one worker and update the in-progress deque.

      Args:
          t_name: Identifier of the calling worker; used only in log lines.
          function: Callable invoked with ``job`` as its single argument.
          job: The work item to process.

      Raises:
          Whatever ``function(job)`` raises; the exception is not swallowed.
      """
      myout.log('线程:'+str(t_name)+' 处理:'+str(job))
      try:
          function(job)
      finally:
          # Drop one entry from the in-progress deque even when the job
          # fails; otherwise the entry stays behind after this worker dies.
          # NOTE(review): presumably get_job() is what appends to
          # self.do_jobing -- confirm against that method.
          with self.lock:
              self.do_jobing.popleft()
      myout.log('线程:'+str(t_name)+' 处理完毕:'+str(job))
예제 #2
0
파일: html.py 프로젝트: whizz1757/Spider
def main(url):
    """Record the domain of ``url``, queue ``url`` and build the pool manager.

    Args:
        url: The starting URL. Its domain (as returned by
            ``tools.get_domain``) is stored in the module-level
            ``web_domain``.
    """
    # Assigning to a module-level name needs the ``global`` declaration;
    # without it Python would silently create a new local variable.
    global web_domain
    domain = tools.get_domain(url)
    web_domain = domain
    myout.log(domain)

    # Seed the shared waiting list with the start URL, then hand the
    # ``process`` callback and that list to the pool manager.
    wait_url.append(url)
    manager = threadPoolManager.PoolManager(process, wait_url)
예제 #3
0
    def __init__(self,function,job,worker_count=100):
        """Create the shared state and start the worker threads.

        Args:
            function: Callable passed through to every worker's
                ``handle_job`` call.
            job: Job source passed through to every worker's
                ``handle_job`` call.
            worker_count: Number of threads to start. Defaults to 100, the
                value that used to be hard-coded.
        """
        # ``deque`` is documented in ``collections``; ``queue.deque`` only
        # resolves because the ``queue`` module happens to import it.
        from collections import deque

        myout.log('启动线程池')
        self.lock=threading.RLock()

        # Must exist before any worker starts, since workers pop from it.
        self.do_jobing=deque()
        for worker in range(worker_count):
            # The loop index doubles as the worker's name in log output.
            worker_man=threading.Thread(target=self.handle_job,args=(worker,function,job))
            worker_man.start()
예제 #4
0
파일: html.py 프로젝트: whizz1757/Spider
def process(now_url):
    """Collect the links found at ``now_url`` and queue the unseen ones.

    Args:
        now_url: URL whose links are fetched via ``tools.get_links`` using
            the module-level ``web_domain``.
    """
    for found in tools.get_links(now_url, web_domain):
        # Skip links that are already recorded in ``get_url``.
        if found in get_url:
            continue

        # First sighting: log it, remember it, and queue it for crawling.
        myout.log("加入:"+found)
        get_url.add(found)
        wait_url.append(found)
예제 #5
0
    def handle_job(self,t_name,function,job):
        """Worker loop: keep polling ``get_job`` until it reports status 0.

        Args:
            t_name: Identifier of this worker; used in log output and
                forwarded to ``dojob``.
            function: Callable forwarded to ``dojob`` for each job.
            job: Job source handed to ``get_job`` on every iteration.
        """
        while True:
            polled = self.get_job(job)
            code = polled['status']

            if code == 0:
                # Status 0 tells the worker to stop.
                break

            if code == 1:
                # Status 1 carries a job to run under the 'job' key.
                self.dojob(t_name, function, polled['job'])
            elif code == 2:
                # Status 2: wait a second before polling again.
                time.sleep(1)

        myout.log('线程:'+str(t_name)+'退出')