import re
import threading
import urlparse
import robotparser

import mechanize

# Project-specific helpers (log, URLUtil, RobotsTextRules, USER_AGENT_ZUMBOT,
# MAX_ROBOTS_CACHE_SIZE, URL_TYPE_TP, URL_TYPE_DR) are assumed to be imported
# from elsewhere in this codebase.

class RobotsValidator:
	"""
	An object that handles checking if we should fetch and crawl a specific URL.
	This is based on the type of the URL (only crawl http URLs) and robot rules.
	Maintains a cache of robot rules already fetched.
	"""

	def __init__(self, user_agents=[USER_AGENT_ZUMBOT], match_url=".+"):
		"""
		Init RobotsValidator class.
		By default, user_agents=[USER_AGENT_ZUMBOT] and match_url=".+" (a regular expression).
		user_agents must be a list or tuple.
		"""
		assert isinstance(user_agents, (list, tuple))
		self.robots = {}	# Dict of robot URLs to robot parsers
		self.match_url = re.compile(match_url)
		self.user_agents = user_agents
		self.lock = threading.Lock()
		self.url_util = URLUtil()
		self.blocked = False
		self.delay = 3
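
	# Construction sketch (illustrative, not part of the original module):
	# USER_AGENT_ZUMBOT, log, URLUtil and the other module-level names are assumed
	# to be defined elsewhere in this codebase.  A validator restricted to a single
	# host could be built with hypothetical values like:
	#
	#   validator = RobotsValidator(user_agents=["ZumBot/1.0"],
	#                               match_url=r"^http://(www\.)?example\.com/")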

	def __repr__(self):
		"""
		Print user_agents and internal robots cache size
		"""
		return "configured user_agents: %s\ninternal robots cache size: %d" % (self.user_agents, len(self.robots))

	
	def __str__(self):
		"""
		Return the cached robots rules, one block per robots.txt URL.
		"""
		ret_str = ""
		for key in self.robots.keys():
			ret_str += "%s:\n\t%s\n" % (key, "\n\t".join(str(self.robots[key]).split("\n")))
		return ret_str
		

	def flushInternalCache(self):
		"""
		Flush internal cache
		"""
		self.robots.clear()


	def flushAllCache(self):
		"""
		Flush all cache
		"""
		self.flushInternalCache()

	flush = flushAllCache
	
	def isDisallowSite(self, url, verbose=False):
		"""
		Returns (True, self.delay) if robots.txt contains the lines below;
		otherwise returns (False, self.delay).
			User-agent: * or zumbot
			Disallow: /
		"""
		logger = log.getLogger()
		self.delay = 3
		robots_site_path = urlparse.urljoin(url, "/robots.txt")	# site-level robots.txt
		# Download and parse robots.txt; parsing is expected to set self.blocked
		# when the site disallows everything
		self.blocked = False
		self._parsingRobotsFile(robots_site_path)
		if self.blocked:
			return True, self.delay
		else:
			return False, self.delay
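
	# Behaviour sketch (hypothetical host): for a site whose robots.txt contains
	#
	#   User-agent: *
	#   Disallow: /
	#
	# this method is expected to return (True, self.delay); for a permissive
	# robots.txt it should return (False, self.delay):
	#
	#   blocked, delay = validator.isDisallowSite("http://blocked.example.com/page")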


	def isCrawlable(self, url, verbose=False, detail=False):
		"""
		Returns True if it's OK to crawl the absolute URL provided.
		"""
		logger = log.getLogger()

		# In the case of daum.net, the result is unconditionally True
		(scheme, netloc, path, param, query, fragment) = urlparse.urlparse(url)

		self.lock.acquire()
		ret = None
		try:

			if not url.startswith("http") or not self.match_url.match(url):
				if verbose:
					logger.info("URL does not start with 'http' or does not match match_url: %s" % url)
				ret = False

			if not ret:
				# clear the internal cache if it exceeds MAX_ROBOTS_CACHE_SIZE
				if len(self.robots) > MAX_ROBOTS_CACHE_SIZE:
					self.robots.clear()
				#logger.debug("++ isCrawlable: A")

				robot_rules = self._getRules(url, verbose)
				#logger.debug("++ isCrawlable: B")
				if verbose:
					try:
						logger.info("URL: %s" % url)
						logger.info("ROBOT_RULES: \n%s" % robot_rules)
						#logger.info(" RULE: %s" % robot_rules.robots_file)
					except Exception, msg:
						logger.info("Exception: %s" % msg)

				parser = robotparser.RobotFileParser()
				parser.parse(robot_rules.robots_file)
				#logger.debug("++ isCrawlable: C")

				if robot_rules.is_useragent_configured:

					is_crawlable = True
					for user_agent in self.user_agents:
						is_crawlable = is_crawlable and parser.can_fetch(user_agent, url)

						if verbose:
							logger.info("matching: %s" % user_agent)
							logger.info("is_crawlable: %s" % is_crawlable)

					if verbose:
						logger.info("return %s" % is_crawlable)
					#return is_crawlable
					ret = is_crawlable
				else:
					# our user agents are not explicitly configured in robots.txt,
					# so fall back to the wildcard ("*") rules
					is_crawlable = parser.can_fetch("*", url)

					if verbose:
						logger.info("is_crawlable (wildcard '*'): %s" % is_crawlable)
						logger.info("return %s" % is_crawlable)
					ret = is_crawlable
		except Exception, msg:
			if verbose:
				logger.info("Exception while checking robots rules: %s" % msg)
		finally:
			self.lock.release()
			if verbose:
				logger.debug("++ robots_lock release")

		if not ret:
			ret = False

		# If the URLType is TOP/Directory, always visit (08/07/08)
		# Moved here to tell whether TP flipped from False to True (08/11/26)
		try:
			forced = False
			url_type = self.url_util.getURLType(url)
			if url_type in (URL_TYPE_TP, URL_TYPE_DR) and ret == False:
				logger.debug("**VISIT forced, URLType: %s", url_type)
				ret = True
				forced = True
		except:
			pass

		#logger.debug("++ is_crawlable: FINISH")
		if detail:
			ret = (ret, forced)

		return ret
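
	# Usage sketch (hypothetical URL): isCrawlable returns a plain boolean by
	# default, or a (crawlable, forced) tuple when detail=True, where forced marks
	# TOP/Directory URLs that are visited even though the robots rules said no:
	#
	#   ok = validator.isCrawlable("http://example.com/some/page.html", verbose=True)
	#   ok, forced = validator.isCrawlable("http://example.com/", detail=True)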

	
	def _getRules(self, url, verbose=False):
		"""
		Returns the RobotsTextRules object for the url (site-level or dir-level).
		First:  use the internal cache
		Second: use memcache
		Third:  download and parse robots.txt
		"""
		logger = log.getLogger()

		# 1. use stored robots dictionary cache
		robots_site_path = urlparse.urljoin(url, "/robots.txt")	# site-level robots.txt URL
		if robots_site_path in self.robots:
			if verbose:
				logger.info("robotstxt in local memory: %s", robots_site_path)
			return self.robots[robots_site_path]
		
		# 2. use memcache (memcache lookup is not implemented here)
		rules = None
		try:
			# 3. download and parse robots.txt (site-level first)
			rules = self._parsingRobotsFile(robots_site_path)
			if verbose:
				logger.info("robotstxt downloaded: %s: %s", rules.return_code, robots_site_path)
			self.robots[robots_site_path] = rules

		except:
			pass

		return rules
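
	# Caching sketch (hypothetical host): rules are cached per robots.txt URL, so
	# the first call for a host downloads its /robots.txt and later calls for any
	# page on that host are answered from self.robots:
	#
	#   validator._getRules("http://example.com/a.html")	# downloads robots.txt
	#   validator._getRules("http://example.com/b.html")	# served from the internal cache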
	

	def _parsingRobotsFile(self, url):
		"""
		Set up internal state after downloading robots.txt.
		If urlopen.code in (401, 403), all user agents are disallowed. -> removed
		If urlopen.code >= 400, all user agents are allowed.
		"""
		domain_name = urlparse.urlparse(url)[1]
		rules = RobotsTextRules()
		#getLogger().debug("++Trying to download: %s", url)
		
		opener = mechanize.build_opener(mechanize.HTTPRefreshProcessor,)

		rq = mechanize.Request(url)
		rq.add_header("User-agent", "Mozilla/5.0 (compatible; Windows NT 6.1?; ZumBot/1.0; http://help.zum.com/inquiry)")


		#shttp.setRequestHeader(user_agent = USER_AGENT)
		rs = None
		try:
			rs = opener.open(rq)
			header = rs.info()
			rules.return_code = rs.code
		except Exception, msg:
			try:
				if not url.startswith("http://www."):
					t_url = url.replace("http://", "http://www.")
					rq = mechanize.Request(t_url)
					rq.add_header("User-agent", "Mozilla/5.0 (compatible; Windows NT 6.1?; ZumBot/1.0; http://help.zum.com/inquiry)")
					rs = opener.open(rq)
					header = rs.info()
					rules.return_code = rs.code
			except Exception, msg:
				return rules