Exemplo n.º 1
0
	def __init__(self, args=Strategy()):
		self.url = args.url 				
		self.max_depth = args.max_depth  	#指定网页深度
		self.max_count = args.max_count		#爬行最大数量
		self.concurrency = args.concurrency	#线程数
		self.timeout = args.timeout			#超时
		self.cookies = args.cookies 		#cookies
		self.ssl_verify = args.ssl_verify 	#ssl
		self.same_host = args.same_host		#是否只抓取相同host的链接
		self.same_domain = args.same_domain	#是否只抓取相同domain的链接

		self.currentDepth = 1  				#标注初始爬虫深度,从1开始
		self.keyword = args.keyword		 	#指定关键词,使用console的默认编码来解码
		

		self.threadPool = ThreadPool(args.concurrency)  #线程池,指定线程数
		
		self.visitedHrefs = set()   		#已访问的链接
		self.unvisitedHrefs = deque()		#待访问的链接 
		self.unvisitedHrefs.append(args.url)#添加首个待访问的链接
		self.isCrawling = False				#标记爬虫是否开始执行任务

		self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
		print self.file
		print 'args.url=\t',args.url

		#################
		#此句有问题
		self.database =  Database(args.dbFile)			#数据库
		# print 'hehe'

		self.lock = Lock()
Exemplo n.º 2
0
    def __init__(self, args=None):
        """Set up the crawler's state from an options object.

        args -- object exposing the attributes ``url``, ``max_depth``,
        ``max_count``, ``concurrency``, ``timeout``, ``cookies``,
        ``ssl_verify``, ``same_host``, ``same_domain``, ``keyword`` and
        ``dbFile``.  When omitted (or None) a fresh ``Strategy()`` is
        created for this call.
        """
        # A ``Strategy()`` call written in the signature is evaluated only
        # once, when the method is defined, and that single object would be
        # shared by every instance built without arguments.  Create it per
        # call instead.
        if args is None:
            args = Strategy()

        self.url = args.url
        self.max_depth = args.max_depth  # page depth limit
        self.max_count = args.max_count  # maximum number of pages to crawl
        self.concurrency = args.concurrency  # number of threads
        self.timeout = args.timeout  # timeout
        self.cookies = args.cookies  # cookies
        self.ssl_verify = args.ssl_verify  # ssl verification setting
        self.same_host = args.same_host  # only follow links on the same host?
        self.same_domain = args.same_domain  # only follow links on the same domain?

        self.currentDepth = 1  # current crawl depth, counted from 1
        # keyword; decoded with the console's default encoding
        self.keyword = args.keyword

        # thread pool sized by the thread count
        self.threadPool = ThreadPool(args.concurrency)

        self.visitedHrefs = set()  # links already visited
        self.unvisitedHrefs = deque()  # links waiting to be visited
        self.unvisitedHrefs.append(args.url)  # seed the queue with the start URL
        self.isCrawling = False  # whether the crawler has started working

        # Per-URL cache file path under BASEDIR/cache/crawler/.
        self.file = BASEDIR + '/cache/crawler/' + genFilename(
            self.url) + '.txt'

        # NOTE(review): the original author flagged the next statement as
        # problematic without giving details -- confirm before relying on it.
        self.database = Database(args.dbFile)  # database

        self.lock = Lock()
Exemplo n.º 3
0
 def __init__(self, filename='', url=None):
     """Record the backing file path and the URL it belongs to.

     filename -- path used as-is when no url is supplied.
     url -- when truthy, the path is instead derived from it as
     BASEDIR + '/cache/crawler/' + genFilename(url) + '.txt' and
     ``filename`` is ignored.
     """
     super(CrawlerFile, self).__init__()
     # Start from the caller-supplied path; a URL, if given, overrides it.
     target = filename
     if url:
         cache_name = genFilename(url) + '.txt'
         target = BASEDIR + '/cache/crawler/' + cache_name
     self.file = target
     self.url = url
Exemplo n.º 4
0
	def __init__(self,filename='',url=None):
		"""Store the URL and choose the file path for this object.

		filename -- path used directly when url is empty or None.
		url -- when truthy, the path becomes
		BASEDIR + '/cache/crawler/' + genFilename(url) + '.txt'
		and ``filename`` is not used.
		"""
		super(CrawlerFile, self).__init__()
		# A truthy URL takes precedence over the explicit filename.
		self.file = (
			BASEDIR + '/cache/crawler/' + genFilename(url) + '.txt'
			if url else filename
		)
		self.url = url