def callbackfunc(request, result):
    """Thread-pool callback: record a downloaded page, parse out its links
    and enqueue a download request for every link not seen before.

    request -- the threadpool.WorkRequest that produced this result (unused)
    result  -- (res, resource, pagebuf) tuple produced by fetchPage.downPage
    """
    res, resource, pagebuf = result
    if pagebuf is None:
        # Download failed or returned no body; nothing to parse.
        return
    for href in fetchPage.parsePage(pagebuf, resource):
        # PAGESCache is the module-level seen-set (dict used as a set);
        # skip links that were already queued or visited.
        if href in PAGESCache:
            continue
        PAGESCache[href] = True
        hostname, filename = fetchPage.parse(href)
        # 'main' is the module-level ThreadPool shared with usingThreadpool.
        main.putRequest(threadpool.WorkRequest(
            fetchPage.downPage, args=[hostname, filename], kwds={},
            callback=callbackfunc))
    fetchPage.dealwithResult(res, resource)
def usingOneThread(limit): urlset = open("input.txt","r") start = datetime.datetime.now() for u in urlset: if limit <= 0 : break limit-=1 hostname , filename = fetchPage.parse(u) res= fetchPage.downPage(hostname,filename,0) fetchPage.dealwithResult(res) end = datetime.datetime.now() print "Start at :\t" , start print "End at :\t" , end print "Total Cost :\t" , end - start print 'Total fetched :', statistics.fetched_url
def usingThreadpool(limit,num_thread): urlset = open("input.txt","r") start = datetime.datetime.now() main = threadpool.ThreadPool(num_thread) for url in urlset : try : hostname , filename = fetchPage.parse(url) req = threadpool.WorkRequest(fetchPage.downPage,args=[hostname,filename],kwds={},callback=callbackfunc) main.putRequest(req) except Exception: print Exception.message while True: try: main.poll() if statistics.total_url >= limit : break except threadpool.NoResultsPending: print "no pending results" break except Exception ,e: print e
def usingThreadpool(limit,num_thread): urlset = open(config.SEED_FILE, "r") start = datetime.datetime.now() for url in urlset : try : if PAGESCache.get(url,None) == None : PAGESCache[url] = True else : continue hostname , filename = fetchPage.parse(url) req = threadpool.WorkRequest(fetchPage.downPage,args=[hostname,filename],kwds={},callback=callbackfunc) main.putRequest(req) except Exception: pass while True: try: main.poll() if config.total_url >= limit : break except threadpool.NoResultsPending: print "no pending results" break except Exception ,e: pass
def usingThreadpool(limit,num_thread): urlset = open(config.SEED_FILE, "r") start = datetime.datetime.now() for url in urlset : try : #ページキャッシュがああるかをみている if PAGESCache.get(url,None) == None : PAGESCache[url] = True else : continue hostname , filename = fetchPage.parse(url) req = threadpool.WorkRequest(fetchPage.downPage,args=[hostname,filename],kwds={},callback=callbackfunc) main.putRequest(req) except Exception: pass while True:# ここがメインの処理 try: main.poll()#polling してリミット以上になったらブレイク if config.total_url >= limit : break except threadpool.NoResultsPending: print "no pending results" break except Exception ,e: pass