Example #1
def __init__(self, url, depth, model, thread, keyword=None):
    self.originalUrl = url
    self.depth = depth
    self.model = model
    self.threadNum = thread
    self.keyword = str(keyword)
    self.currentDepth = 0
    self.crawState = False
    self.threadPool = ThreadPool(self.threadNum)  # ThreadPool comes from the project's own module
    self.visitedUrls = set()      # URLs already queued, kept for de-duplication
    self.unvisitedUrls = deque()  # FIFO frontier (collections.deque)
    self.unvisitedUrls.append(url)
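The constructor relies on a ThreadPool class defined elsewhere in the project. Judging only from the calls the Crawler makes (the constructor here, plus startThreads, putTask, getTaskLeft and stopThreads in Example #2 below), its interface looks roughly like the following minimal sketch; this is an assumption, not the repository's actual implementation.

# Hypothetical stand-in for the project's ThreadPool; only the methods
# Crawler actually calls are reproduced here.
import threading
from Queue import Queue  # Python 2 standard library


class ThreadPool(object):
    def __init__(self, threadNum):
        self.taskQueue = Queue()
        self.threads = [threading.Thread(target=self._work)
                        for _ in range(threadNum)]
        for t in self.threads:
            t.setDaemon(True)  # let the interpreter exit while workers idle

    def startThreads(self):
        for t in self.threads:
            t.start()

    def putTask(self, func, *args):
        self.taskQueue.put((func, args))

    def getTaskLeft(self):
        # Tasks still queued or currently executing
        return self.taskQueue.unfinished_tasks

    def stopThreads(self):
        pass  # daemon workers die with the main thread in this sketch

    def _work(self):
        while True:
            func, args = self.taskQueue.get()
            try:
                func(*args)
            finally:
                self.taskQueue.task_done()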
Example #2
File: crawler.py Project: 360sec/Crawler
import time
from collections import deque

# ThreadPool, DownLoadPage and GetLinks come from the project's other
# modules, which are not shown on this page.


class Crawler(object):
    def __init__(self, url, depth, model, thread, keyword=None):
        self.originalUrl = url
        self.depth = depth
        self.model = model
        self.threadNum = thread
        self.keyword = str(keyword)
        self.currentDepth = 0
        self.crawState = False
        self.threadPool = ThreadPool(self.threadNum)
        self.visitedUrls = set()      # URLs already queued, kept for de-duplication
        self.unvisitedUrls = deque()  # FIFO frontier for the current depth
        self.unvisitedUrls.append(url)

    def start(self):
        print "[START] Crawling, this may take a while...\n"
        self.crawState = True
        self.threadPool.startThreads()
        while self.currentDepth <= self.depth:
            self.assignCurrentDepthTasks()
            # Wait for the thread pool to drain this depth's tasks before
            # moving one level deeper.
            while self.threadPool.getTaskLeft():
                time.sleep(5)
            self.currentDepth += 1
        self.stop()

    def assignCurrentDepthTasks(self):
        while self.unvisitedUrls:
            url = self.unvisitedUrls.popleft()
            self.threadPool.putTask(self.task_handler, url)
            self.visitedUrls.add(url)
        print 'Depth %d finished. Visited %d links in total.\n' % (
            self.currentDepth, len(self.visitedUrls))

    def stop(self):
        self.crawState = False
        self.threadPool.stopThreads()

    def task_handler(self, url):
        downloadpage = DownLoadPage(url)
        # The default download model is static
        if downloadpage.downloadpage(self.model):
            page, url = downloadpage.returnpage()
            getlinks = GetLinks(page, url)
            links = getlinks.getLinks(self.originalUrl)
            for link in links:
                self.outputData(link)
                if link not in self.visitedUrls:  # skip URLs queued at earlier depths
                    self.unvisitedUrls.append(link)

    def outputData(self, url):  # filter modes: dynamic links, subdomains, keyword, etc.
        if self.keyword == 'dynamic':
            # Links with a query string are collected into dict.txt
            if '?' in url:
                with open('dict.txt', 'a+') as f:
                    f.write(url + '\n')
                print url
        elif self.keyword == 'subdomain':
            url = url.split('/')[2]  # TODO: filter domain
            print '[INFO]', url
        else:
            print '[INFO]', url
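For context, a minimal driver is sketched below. The URL, depth and thread count are illustrative values, not taken from the repository; 'static' follows the comment in task_handler about the default download model, and 'dynamic' is one of the keyword modes handled in outputData.

# Hypothetical entry point; all argument values are assumptions.
if __name__ == '__main__':
    crawler = Crawler('http://example.com/', depth=2, model='static',
                      thread=10, keyword='dynamic')
    crawler.start()  # blocks, polling every 5 seconds, until depth 2 is done

With keyword='dynamic', every discovered link that contains a query string is printed and appended to dict.txt.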