Example #1
	def _process_page(self, item, spider):
		cnt = {'filtered':0,'enqueue':0,'refuse':len(item['refused_links']),'other_link':0}
		# Record that this url has been crawled
		self.server.sadd(spider.URL_VISITED_KEY, item['url'])
		
		domain, ret_type = domain_getter.get_domain(item['url'])
		# This domain has not appeared in the domain bloom filter yet
		if not self.bloom_domain_vec.add(domain):
			self._initRelateDomain(domain, spider)
			# The domain is not yet in the global domain table in redis
			if not self.server.exists(domains_key%domain):
				self._initGlobalDomain(domain)

		for link in item['links']:

			link_domain, ret_type = domain_getter.get_domain(link)
			# Check whether the link's domain is the same as the current crawling domain D0
			if link_domain == spider.CRAWLING_DOMAIN:
				# Increment D0's size by 1
				self.server.hincrby(domains_key%link_domain, 'size', 1)
				self.server.hincrby(related_domains_key%(spider.CRAWLING_DOMAIN,domain), 'size', 1)
				# Check whether its netloc has been seen before
				netloc = urlparse(link).netloc
				if not self.bloom_netloc_vec.add(netloc):
					if self._isBlogLink(link):
						# On the filter list, skip it
						self.server.sadd(spider.BLOG_IGNORE_KEY, link)
						cnt['filtered'] += 1
					else:
						# Not seen before, push the page onto the queue
						self.server.rpush(spider.URL_QUEUE_KEY, "http://"+netloc)
						cnt['enqueue'] += 1
			else:  # Different domain, check whether it has already been seen
				cnt['other_link'] += 1
				if self.bloom_domain_vec.add(link_domain):
					# Already seen: increment the indegree of the corresponding record D1
					self.server.hincrby(domains_key%link_domain, 'indegree', 1)
					self.server.hincrby(related_domains_key%(spider.CRAWLING_DOMAIN,link_domain), 'indegree', 1)
				else:
					# Not seen before: add record D1 to the domain table with indegree initialized to 1 and outdegree to 0; increment D0's outdegree by 1
					if not self.server.exists(domains_key%link_domain):
						self._initGlobalDomain(link_domain)
					self.server.hincrby(domains_key%domain, 'outdegree', 1)

					self._initRelateDomain(link_domain, spider)
					self.server.hincrby(related_domains_key%(spider.CRAWLING_DOMAIN,link_domain), 'outdegree', 1)

		for link in item['refused_links']:
			spider.log(link+" refused by robots.txt", level=log.INFO)
			self.server.sadd(spider.ROBOT_REFUSED_KEY, link)

		spider.log('%s from %s'%(str(cnt), item['url']), level=log.INFO)
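
The helpers _initGlobalDomain and _initRelateDomain are called here but not shown. A minimal sketch of what they could look like, inferred only from the hash fields used above (size, indegree, outdegree) and the comment that D1 starts with indegree 1; the hmset layout is an assumption, not the original implementation:

	def _initGlobalDomain(self, domain):
		# Assumed: create the global hash for a newly seen domain D1.
		self.server.hmset(domains_key % domain,
		                  {'size': 0, 'indegree': 1, 'outdegree': 0})

	def _initRelateDomain(self, domain, spider):
		# Assumed: create the per-crawl hash relating the crawling domain D0 to this domain.
		self.server.hmset(related_domains_key % (spider.CRAWLING_DOMAIN, domain),
		                  {'size': 0, 'indegree': 0, 'outdegree': 0})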
Example #2
	def _process_page(self, item, spider):
		# Record that this url has been crawled
		self.server.rpush(spider.URL_VISITED_KEY, item['url'])
		
		domain, ret_type = domain_getter.get_domain(item['url'])
		if not self.bloom_domain_vec.add(domain):
			self._initRelateDomain(domain, spider)
			if not self.server.exists(domains_key%domain):
				self._initGlobalDomain(domain)

		for link in item['links']:

			if self._isBlogLink(link):
				self.server.rpush(spider.BLOG_IGNORE_KEY, link)
				continue

			link_domain, ret_type = domain_getter.get_domain(link)
			# Check whether the link's domain is the same as the current crawling domain D0
			if link_domain == spider.CRAWLING_DOMAIN:
				# Increment D0's size by 1
				self.server.hincrby(domains_key%link_domain, 'size', 1)
				self.server.hincrby(related_domains_key%(spider.CRAWLING_DOMAIN,domain), 'size', 1)
				# Check whether its netloc has been seen before
				netloc = urlparse(link).netloc
				if not self.bloom_netloc_vec.add(netloc):
					# Not seen before, push the page onto the queue
					self.server.rpush(spider.URL_QUEUE_KEY, "http://"+netloc)
			else:  # Different domain, check whether it has already been seen
				if self.bloom_domain_vec.add(link_domain):
					# Already seen: increment the indegree of the corresponding record D1
					self.server.hincrby(domains_key%link_domain, 'indegree', 1)
					self.server.hincrby(related_domains_key%(spider.CRAWLING_DOMAIN,link_domain), 'indegree', 1)
				else:
					# Not seen before: add record D1 to the domain table with indegree initialized to 1 and outdegree to 0; increment D0's outdegree by 1
					if not self.server.exists(domains_key%link_domain):
						self._initGlobalDomain(link_domain)
					self.server.hincrby(domains_key%domain, 'outdegree', 1)

					self._initRelateDomain(link_domain, spider)
					self.server.hincrby(related_domains_key%(spider.CRAWLING_DOMAIN,link_domain), 'outdegree', 1)

		for link in item['refused_links']:
			spider.log(link+" refused by robots.txt", level=log.INFO)
			self.server.rpush(spider.ROBOT_REFUSED_KEY, link)
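
Both pipelines depend on bloom_domain_vec.add() and bloom_netloc_vec.add() reporting whether the value was already present. A minimal sketch of how such filters could be created, assuming pybloom-style semantics where add() returns True if the element was already in the filter; the capacity and error rate are illustrative only:

from pybloom import ScalableBloomFilter

# Assumed construction of the two membership filters used above.
bloom_domain_vec = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
bloom_netloc_vec = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)

seen = bloom_domain_vec.add('163.com')   # False: first time the domain is added
seen = bloom_domain_vec.add('163.com')   # True: the domain was already present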
Example #3
	def __init__(self, begin="http://www.163.com"):
		super(WebModelSpider, self).__init__()
		print 'initing spider with begin:',begin
		# Derive the redis storage keys from the begin parameter
		self.CRAWLING_DOMAIN = domain_getter.get_domain(begin)[0]
		self.URL_QUEUE_KEY = url_queue_key%self.CRAWLING_DOMAIN
		self.URL_VISITED_KEY = url_visited_key%self.CRAWLING_DOMAIN
		self.BLOG_IGNORE_KEY = url_ignore_key%self.CRAWLING_DOMAIN
		self.ROBOT_REFUSED_KEY = robots_refused_key%self.CRAWLING_DOMAIN
		
		self.BEGIN_URL = begin
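
domain_getter.get_domain() returns a (domain, type) pair, and the parse() examples below check the type against TYPE_IP. A rough sketch of such a helper, assuming tldextract for registered-domain extraction; the constants and the IPv4 check are illustrative guesses, not the project's own module:

import re
import tldextract
from urlparse import urlparse

TYPE_DOMAIN, TYPE_IP = 0, 1   # assumed constants

def get_domain(url):
	# Return (registered domain or IP, type) for the url's host.
	host = urlparse(url).netloc.split(':')[0]
	if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', host):
		return host, TYPE_IP
	return tldextract.extract(url).registered_domain, TYPE_DOMAIN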
Example #4
    def __init__(self, begin="http://www.163.com"):
        super(WebModelSpider, self).__init__()
        print 'initing spider with begin:', begin
        # Derive the redis storage keys from the begin parameter
        self.CRAWLING_DOMAIN = domain_getter.get_domain(begin)[0]
        self.URL_QUEUE_KEY = url_queue_key % self.CRAWLING_DOMAIN
        self.URL_VISITED_KEY = url_visited_key % self.CRAWLING_DOMAIN
        self.BLOG_IGNORE_KEY = url_ignore_key % self.CRAWLING_DOMAIN
        self.ROBOT_REFUSED_KEY = robots_refused_key % self.CRAWLING_DOMAIN

        self.BEGIN_URL = begin
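
The key templates used in these constructors (url_queue_key, url_visited_key, url_ignore_key, robots_refused_key) and in the pipelines (domains_key, related_domains_key) are defined elsewhere in the project. A purely illustrative set of definitions; the real prefixes are not visible in these examples:

# Hypothetical redis key templates; the actual values may differ.
url_queue_key       = 'webmodel:%s:url_queue'        # urls waiting to be crawled
url_visited_key     = 'webmodel:%s:url_visited'      # urls already crawled
url_ignore_key      = 'webmodel:%s:blog_ignore'      # filtered blog links
robots_refused_key  = 'webmodel:%s:robots_refused'   # links refused by robots.txt
domains_key         = 'webmodel:domain:%s'           # global per-domain hash (size/indegree/outdegree)
related_domains_key = 'webmodel:%s:related:%s'       # per-crawl hash relating D0 to a domain D1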
Example #5
	def parse(self, response):
		if response.status != 200:
			self.log("Response code: %d from %s"%(response.status, response['url']), level=log.WARNING)
			return
		# Parse the url and get its domain
		netloc = urlparse(response.url).netloc
		domain, ret_type = domain_getter.get_domain(response.url)

		# Check for and fetch the robots.txt content for this domain
		# robotsparser decides whether a url is allowed by the rules in robots.txt
		robotsparser, rulesetItem = self.getRuleset(domain)
		# Send the rulesetItem to the pipeline
		yield rulesetItem

		# Bare IP address: nothing more to do
		if ret_type == TYPE_IP:
			return
			
		pageItem = PageItem()
		# Extract content with xpath
		pageItem['url'] = response.url
		hxs = Selector(text=response.body)
		try:
			pageItem['title'] = hxs.xpath('/html/head/title/text()').extract()[0]
		except IndexError:
			pageItem['title'] = '(crawl title failed)'
		pageItem['links'] = []
		pageItem['refused_links'] = []

		# Extract the hyperlinks from all <a> tags
		for link in hxs.xpath('//a/@href').extract() :
			# Check whether the href is an absolute url or a relative path
			if link.startswith('http://') :
				# robots check: keep only links allowed by robots.txt
				if not robotsparser or robotsparser.is_allowed('*', link):
					pageItem['links'].append(link)

					# The scheduler pulls requests from the redis queue; do not issue the request here
					# yield Request(link, callback=self.parse)
				else:
					pageItem['refused_links'].append(link)

			elif link.startswith('/') :
				# Relative link: prepend the scheme and netloc to complete the url
				link = "http://"+ netloc + link
				# robots check: keep only links allowed by robots.txt
				if not robotsparser or robotsparser.is_allowed('*', link) :
					pageItem['links'].append(link)

					# The scheduler pulls requests from the redis queue; do not issue the request here
					# yield Request(link, callback=self.parse)
				else:
					pageItem['refused_links'].append(link)
			else :
				# Neither of the above; usually a javascript function
				# msg = '%s : not a url'%(link,)
				# self.log(msg, level=log.DEBUG)
				pass

		msg = 'crawled %d links from %s.'%(
			len(pageItem['links']), pageItem['url'])
		self.log(msg, level=log.INFO)
		# log.msg(repr(pageItem).decode("unicode-escape") + '\n', level=log.INFO, spider=self)

		yield pageItem
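
PageItem itself is not shown in these examples. A plausible definition, inferred from the fields that parse() fills in and the pipelines read; a sketch only, not the project's items.py:

from scrapy.item import Item, Field

class PageItem(Item):
	# Fields populated by parse() and consumed by the _process_page pipelines above.
	url = Field()
	title = Field()
	links = Field()
	refused_links = Field()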
Example #6
    def parse(self, response):
        if response.status != 200:
            self.log("Response code: %d from %s" %
                     (response.status, response.url),
                     level=log.WARNING)
            return
        # Parse the url and get its domain
        netloc = urlparse(response.url).netloc
        domain, ret_type = domain_getter.get_domain(response.url)

        # Check for and fetch the robots.txt content for this domain
        # robotsparser decides whether a url is allowed by the rules in robots.txt
        robotsparser, rulesetItem = self.getRuleset(domain)
        # Send the rulesetItem to the pipeline
        yield rulesetItem

        # Bare IP address: nothing more to do
        if ret_type == TYPE_IP:
            return

        pageItem = PageItem()
        # Extract content with xpath
        pageItem['url'] = response.url
        hxs = Selector(text=response.body)
        try:
            pageItem['title'] = hxs.xpath(
                '/html/head/title/text()').extract()[0]
        except IndexError:
            pageItem['title'] = '(crawl title failed)'
        pageItem['links'] = []
        pageItem['refused_links'] = []

        # Extract the hyperlinks from all <a> tags
        for link in hxs.xpath('//a/@href').extract():
            # Check whether the href is an absolute url or a relative path
            if link.startswith('http://'):
                # robots check: keep only links allowed by robots.txt
                if not robotsparser or robotsparser.is_allowed('*', link):
                    pageItem['links'].append(link)

                    # The scheduler pulls requests from the redis queue; do not issue the request here
                    # yield Request(link, callback=self.parse)
                else:
                    pageItem['refused_links'].append(link)

            elif link.startswith('/'):
                # Relative link: prepend the scheme and netloc to complete the url
                link = "http://" + netloc + link
                # robots check: keep only links allowed by robots.txt
                if not robotsparser or robotsparser.is_allowed('*', link):
                    pageItem['links'].append(link)

                    # The scheduler pulls requests from the redis queue; do not issue the request here
                    # yield Request(link, callback=self.parse)
                else:
                    pageItem['refused_links'].append(link)
            else:
                # Neither of the above; usually a javascript function
                # msg = '%s : not a url'%(link,)
                # self.log(msg, level=log.DEBUG)
                pass

        msg = 'crawled %d links from %s.' % (len(
            pageItem['links']), pageItem['url'])
        self.log(msg, level=log.INFO)
        # log.msg(repr(pageItem).decode("unicode-escape") + '\n', level=log.INFO, spider=self)

        yield pageItem
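
getRuleset() returns a parser exposing is_allowed('*', link) together with an item for the pipeline. A minimal sketch of the robots.txt side, assuming the robotexclusionrulesparser package (whose is_allowed(user_agent, url) matches the call above); the fetch helper and its error handling are simplified assumptions, and the RulesetItem part is omitted:

from robotexclusionrulesparser import RobotExclusionRulesParser

def fetch_robots_parser(domain):
	# Fetch and parse http://<domain>/robots.txt; return None if it cannot be read.
	parser = RobotExclusionRulesParser()
	try:
		parser.fetch('http://%s/robots.txt' % domain)
	except Exception:
		return None
	return parser

# Usage mirroring parse(): a missing parser means "allow everything".
parser = fetch_robots_parser('www.163.com')
allowed = (not parser) or parser.is_allowed('*', 'http://www.163.com/index.html')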