Example #1
def recurse(urlList, path, deepth=-1):
    # Crawl every URL in urlList, dump the URLs collected so far to `path`,
    # and descend into newly found links until `deepth` reaches 0.
    # With the default deepth=-1 the recursion depth is unlimited.
    if not urlList:
        return None
    if deepth > 0:
        deepth -= 1
    elif deepth == 0:
        return None

    for url in urlList:
        content = getHtml(url)            # download the page
        urlList = getUrl(content)         # extract every link (rebinding urlList does not change the list being iterated)
        tmpUrlList = filter(urlList)      # screen the links -- presumably a module-level helper, since the builtin filter needs two arguments
        print "flash disk starting.... >>>  file : " + path
        flashFile(urlSpidered.keys(), path)   # flush the URLs collected so far to disk
        print "flash disk over !  file : " + path
        countTimes.countTime()
        if tmpUrlList:
            tmpUrlList = sureDomain(tmpUrlList, domain)   # keep only links on the target domain
            recurse(tmpUrlList, path, deepth)
    print "*" * 66
    print json.dumps(urlSpidered.keys())
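
The snippet relies on several names defined elsewhere in the module and not shown in this listing: getHtml, getUrl, filter, sureDomain, flashFile, the urlSpidered dict, and the countTimes helper. Below is a minimal sketch of what getHtml and getUrl could look like, written in the same Python 2 style as the examples; the urllib2-based download, the regex, and the structure of urlSpidered are assumptions inferred from how the names are used, not the author's code.

# -*- coding: utf8 -*-
import re
import urllib2

# URLs that have already been crawled; recurse() dumps its keys to disk.
urlSpidered = {}

def getHtml(url):
    # Download the page body; return an empty string on any network error.
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return ''

def getUrl(content):
    # Pull every href value out of the HTML with a simple regex.
    return re.findall(r'href=[\'"]?([^\'" >]+)', content)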
Example #2
def recurse(urlList, path, deepth=-1):
    if not urlList:
        return None
    if deepth > 0:
        deepth -= 1
    elif deepth == 0:
        return None

    for url in urlList:
        content = getHtml(url)
        urlList = getUrl(content)
        tmpUrlList = filter(urlList)
        print "flash disk starting.... >>>  file : " + path
        flashFile(urlSpidered.keys(), path)
        print "flash disk over !  file : " + path
        countTimes.countTime()
        if tmpUrlList:
            tmpUrlList = sureDomain(tmpUrlList, domain)
            recurse(tmpUrlList, path, deepth)
    print "*" * 66
    print json.dumps(urlSpidered.keys())

if __name__ == '__main__':

    print "+" * 88
    print "spider start working  "
    countTimes.printCurrentTime()
    director = '../data'
    if not os.path.exists(director):   # make sure the output directory exists
        os.makedirs(director)
    path = director + '/url.json'
    recurse([domain], path, 3)         # crawl at most 3 levels deep from the root
    print "+" * 88
    print "spider worked over !! "
    countTimes.countTime()
    countTimes.printCurrentTime()
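
sureDomain and flashFile are also called in the examples but never defined in this listing. A possible sketch of both, based only on how they are called (the names are kept, the implementations are assumed):

import json

def sureDomain(urlList, domain):
    # Keep only links that stay on the target domain; resolve
    # root-relative links against it.  (Hypothetical implementation.)
    result = []
    for url in urlList:
        if url.startswith(domain):
            result.append(url)
        elif url.startswith('/'):
            result.append(domain + url)
    return result

def flashFile(urlKeys, path):
    # Flush the list of crawled URLs to `path` as JSON.
    with open(path, 'w') as f:
        json.dump(urlKeys, f)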



Example #3
#coding:utf8
import spiderTool as spider
import countTimes
import os
__author__ = 'flybird1971'

# domain of the site to crawl
domain = 'http://www.guonainai.com'

print "+" * 88
print "spider start working  "
countTimes.printCurrentTime()
director = '../data'
if not os.path.exists(director):
    os.makedirs(director)
path = director + '/url.json'
spider.recurse([domain], path)   # deepth defaults to -1, i.e. unlimited crawl depth
print "+" * 88
print "spider worked over !! "
countTimes.countTime()
countTimes.printCurrentTime()
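
countTimes is imported by the driver but not shown anywhere in the listing. Judging from the calls printCurrentTime() and countTime(), it is a small timing helper; a minimal sketch of what it might contain (purely an assumption):

# countTimes.py -- hypothetical timing helper matching the calls above
import time

_start = time.time()

def printCurrentTime():
    # Print the current wall-clock time.
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

def countTime():
    # Print the seconds elapsed since the module was imported.
    print 'elapsed : %.2f s' % (time.time() - _start)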