示例#1
0
def callbackfunc(request,result):
    """Process one finished download: queue unseen links, then record the result.

    Args:
        request: The work request this callback belongs to; not used here.
        result: A 3-item sequence unpacked as ``(res, resource, pagebuf)``
            (presumably what ``fetchPage.downPage`` returned -- confirm).

    Returns:
        None. When ``pagebuf`` is None the function returns at once: no links
        are queued and ``fetchPage.dealwithResult`` is not called.
    """
    res, resource, pagebuf = result
    if pagebuf is None:
        return

    for href in fetchPage.parsePage(pagebuf, resource):
        # PAGESCache holds True for every link already queued; skip repeats.
        if PAGESCache.get(href) is not None:
            continue
        PAGESCache[href] = True
        hostname, filename = fetchPage.parse(href)
        # The follow-up request registers this same function as its callback.
        main.putRequest(threadpool.WorkRequest(
            fetchPage.downPage,
            args=[hostname, filename],
            kwds={},
            callback=callbackfunc,
        ))
    fetchPage.dealwithResult(res, resource)
示例#2
0
文件: run.py 项目: qz2501/miniCrowler
def callbackfunc(request,result):
    """Handle a completed page download and schedule the links found in it.

    Args:
        request: The originating work request; ignored by this function.
        result: A 3-item sequence unpacked as ``(res, resource, pagebuf)``
            (presumably the return value of ``fetchPage.downPage`` -- confirm).

    Returns:
        None. If ``pagebuf`` is None nothing is scheduled and
        ``fetchPage.dealwithResult`` is skipped.
    """
    res, resource, pagebuf = result
    if pagebuf is None:
        return

    hreflist = fetchPage.parsePage(pagebuf, resource)
    for href in hreflist:
        # A non-None PAGESCache entry means this link was already scheduled.
        if PAGESCache.get(href) is not None:
            continue
        PAGESCache[href] = True
        hostname, filename = fetchPage.parse(href)
        follow_up = threadpool.WorkRequest(
            fetchPage.downPage,
            args=[hostname, filename],
            kwds={},
            callback=callbackfunc,  # same callback handles the follow-up page
        )
        main.putRequest(follow_up)
    fetchPage.dealwithResult(res, resource)
示例#3
0
def usingOneThread(limit):
    urlset = open("input.txt","r")
    start = datetime.datetime.now()
    for u in urlset:
        if limit <= 0 : break
        limit-=1
        hostname , filename = fetchPage.parse(u)
        res= fetchPage.downPage(hostname,filename,0)
        fetchPage.dealwithResult(res)
    end = datetime.datetime.now()
    print "Start at :\t" , start
    print "End at :\t" , end
    print "Total Cost :\t" , end - start
    print 'Total fetched :', statistics.fetched_url
示例#4
0
def usingThreadpool(limit,num_thread):
    urlset = open("input.txt","r")
    start = datetime.datetime.now()
    main = threadpool.ThreadPool(num_thread)
    for url in urlset :
        try :
            hostname , filename = fetchPage.parse(url)
            req = threadpool.WorkRequest(fetchPage.downPage,args=[hostname,filename],kwds={},callback=callbackfunc)
            main.putRequest(req)
        except Exception:
            print Exception.message        
    while True:
        try:
            main.poll()
            if statistics.total_url >= limit : break
        except threadpool.NoResultsPending:
            print "no pending results"
            break
        except Exception ,e:
            print e
示例#5
0
def usingThreadpool(limit,num_thread):
    urlset = open(config.SEED_FILE, "r")
    start = datetime.datetime.now()
    for url in urlset :
        try :
            if PAGESCache.get(url,None) == None : PAGESCache[url] = True 
            else : continue
            hostname , filename = fetchPage.parse(url)
            req = threadpool.WorkRequest(fetchPage.downPage,args=[hostname,filename],kwds={},callback=callbackfunc)
            main.putRequest(req)
        except Exception:
            pass
    while True:
        try:
            main.poll()
            if config.total_url >= limit : break
        except threadpool.NoResultsPending:
            print "no pending results"
            break
        except Exception ,e:
            pass
示例#6
0
文件: run.py 项目: qz2501/miniCrowler
def usingThreadpool(limit,num_thread):
    urlset = open(config.SEED_FILE, "r")
    start = datetime.datetime.now()
    for url in urlset :
        try : 
            #ページキャッシュがああるかをみている
            if PAGESCache.get(url,None) == None : PAGESCache[url] = True 
            else : continue
            hostname , filename = fetchPage.parse(url)
            req = threadpool.WorkRequest(fetchPage.downPage,args=[hostname,filename],kwds={},callback=callbackfunc)
            main.putRequest(req)
        except Exception:
            pass
    while True:# ここがメインの処理
        try:
            main.poll()#polling してリミット以上になったらブレイク
            if config.total_url >= limit : break
        except threadpool.NoResultsPending:
            print "no pending results"
            break
        except Exception ,e:
            pass