# recurse(urlList, path, deepth=-1): fetch every URL in urlList, extract the
# links found in each page and recurse into them.
#
# NOTE(review): this definition is collapsed onto one physical line, so the
# original indentation (which statements belong to the `for` / `if` bodies,
# and where the two trailing prints sit) cannot be recovered from the text.
# Restore the layout from version control before relying on the nesting; the
# description below follows statement order only.
#
# Parameters:
#   urlList -- URLs to process; a falsy value makes the call return None.
#   path    -- file path passed to flashFile() and echoed in the progress
#              messages.
#   deepth  -- depth budget. A positive value is decremented before the URLs
#              are processed, 0 makes the call return None, and a negative
#              value (the default -1) is neither decremented nor checked.
#
# Statement order after the guards, starting at `for url in urlList:`
#   1. content = getHtml(url); urlList = getUrl(content). This rebinds the
#      parameter name while the loop is iterating the original list object.
#   2. tmpUrlList = filter(urlList). It is called with a single argument, so
#      this is presumably a project helper shadowing the builtin -- TODO confirm.
#   3. flashFile(urlSpidered.keys(), path), bracketed by two progress prints,
#      followed by countTimes.countTime().
#   4. If tmpUrlList is truthy: sureDomain(tmpUrlList, domain), then
#      recurse(tmpUrlList, path, deepth).
#   5. A 66-character "*" rule and json.dumps(urlSpidered.keys()) are printed.
#
# getHtml, getUrl, filter, flashFile, sureDomain, urlSpidered, domain,
# countTimes and json are not defined in this block; they must come from the
# enclosing module. The only explicit return value is None (the guard cases).
# The print statements use Python 2 syntax.
def recurse(urlList,path,deepth=-1): if not urlList : return None if deepth>0: deepth -= 1 elif deepth == 0: return None for url in urlList: content = getHtml(url) urlList = getUrl(content) tmpUrlList = filter(urlList) print "flash disk starting.... >>> file : "+path flashFile(urlSpidered.keys(),path) print "flash disk over ! file : "+path countTimes.countTime() if tmpUrlList : tmpUrlList = sureDomain(tmpUrlList,domain) recurse(tmpUrlList,path,deepth) print "*"*66 print json.dumps(urlSpidered.keys())
# recurse(urlList, path, deepth=-1): depth-limited recursive crawl over urlList.
#
# NOTE(review): the whole definition sits on a single physical line, so the
# block nesting is not visible here -- in particular it cannot be told whether
# the final two prints run per URL or once per call. Treat the notes below as
# a reading of the statement sequence, not of the indentation.
#
# Guards (evaluated first):
#   * `not urlList`   -> return None
#   * `deepth > 0`    -> deepth is decremented, processing continues
#   * `deepth == 0`   -> return None
#   * negative deepth -> no decrement and no cut-off (default is -1)
#
# Per-URL work, in order:
#   * getHtml(url) produces `content`; getUrl(content) is assigned back to
#     `urlList` (the loop keeps iterating the list it started with).
#   * filter(urlList) produces `tmpUrlList`; the one-argument call means this
#     cannot be the builtin filter -- presumably a module-level helper, verify.
#   * flashFile(urlSpidered.keys(), path) is called between a "starting" and
#     an "over" progress print, then countTimes.countTime() is called.
#   * When tmpUrlList is truthy it is passed through
#     sureDomain(tmpUrlList, domain) and then to recurse() with the same
#     `path` and the current `deepth`.
#
# Finally a line of 66 "*" characters and json.dumps(urlSpidered.keys()) are
# printed. No value other than None is returned explicitly.
#
# External names used: getHtml, getUrl, filter, flashFile, sureDomain,
# urlSpidered, domain, countTimes, json. Print statements are Python 2 syntax.
def recurse(urlList, path, deepth=-1): if not urlList: return None if deepth > 0: deepth -= 1 elif deepth == 0: return None for url in urlList: content = getHtml(url) urlList = getUrl(content) tmpUrlList = filter(urlList) print "flash disk starting.... >>> file : " + path flashFile(urlSpidered.keys(), path) print "flash disk over ! file : " + path countTimes.countTime() if tmpUrlList: tmpUrlList = sureDomain(tmpUrlList, domain) recurse(tmpUrlList, path, deepth) print "*" * 66 print json.dumps(urlSpidered.keys())
# NOTE(review): this line is collapsed and starts in the middle of a function
# body: the leading statements use `url`, `deepth` and call recurse(), but the
# enclosing `def` is not on this line. They are left untouched.
#
# The second half is the script entry point (`if __name__ == '__main__':`).
# In statement order it:
#   * prints an 88-character "+" banner and "spider start working ", then
#     calls countTimes.printCurrentTime();
#   * creates the directory '../data' with os.makedirs() when
#     os.path.exists() reports it missing;
#   * builds path = '../data/url.json' and calls recurse([domain], path, 3),
#     i.e. with an explicit depth budget of 3;
#   * prints the banner and "spider worked over !! ", then calls
#     countTimes.countTime() and countTimes.printCurrentTime().
#
# `domain`, `os`, `countTimes` and `recurse` must be defined elsewhere in the
# module. The print statements use Python 2 syntax.
content = getHtml(url) urlList = getUrl(content) tmpUrlList = filter(urlList) print "flash disk starting.... >>> file : "+path flashFile(urlSpidered.keys(),path) print "flash disk over ! file : "+path countTimes.countTime() if tmpUrlList : tmpUrlList = sureDomain(tmpUrlList,domain) recurse(tmpUrlList,path,deepth) print "*"*66 print json.dumps(urlSpidered.keys()) if __name__ == '__main__': print "+"*88 print "spider start working " countTimes.printCurrentTime() director = '../data' if not os.path.exists(director): os.makedirs(director) path = director+'/url.json' recurse([domain],path,3) print "+"*88 print "spider worked over !! " countTimes.countTime() countTimes.printCurrentTime()
#coding:utf8
"""Driver script for the spider.

Prints a start banner, makes sure the ``../data`` directory exists, hands
``../data/url.json`` together with the start URL to ``spider.recurse`` and
prints timing information through ``countTimes`` when the call returns.
"""
# Import order is kept as in the original: the project modules may do work at
# import time, so they are deliberately not regrouped.
import spiderTool as spider
import countTimes
import os

__author__ = 'flybird1971'

# Root URL of the website to crawl.
domain = 'http://www.guonainai.com'

# Single-argument print(...) behaves the same as the print statement on
# Python 2 and also parses on Python 3.
print("+" * 88)
print("spider start working ")
countTimes.printCurrentTime()

director = '../data'
# Create the directory unconditionally instead of testing for it first: an
# exists()-then-makedirs() pair fails if the directory appears in between.
try:
    os.makedirs(director)
except OSError:
    # Tolerate only "directory already there"; re-raise real failures such as
    # a permission error or a regular file occupying the name.
    if not os.path.isdir(director):
        raise

path = director + '/url.json'
# No depth argument is passed, so recurse() runs with its own default.
spider.recurse([domain], path)

print("+" * 88)
print("spider worked over !! ")
countTimes.countTime()
countTimes.printCurrentTime()