Example #1
import nltk
from pyhashxx import hashxx  # assumed source of hashxx (xxHash bindings)
from scrapy.exceptions import DropItem

# Datastore is assumed to be the project's thin wrapper around a Redis
# client (see the sketch after this example).


class KeywordExtractor(object):
	'''
	Extracts keywords from title, extracted_text, meta_description
	'''
	def __init__(self):
		self.r = Datastore()
		self.URL2ID = "URL2ID"
		self.WORD_SET = "WORD_SET"
		self.WORD2ID = "WORD2ID"
		self.WORD_IN = "WORD_IN"
		self.WORD_CTR = "WORD_CTR"
		#self.r.set(self.WORD_CTR, -1)
		self.stemmer = nltk.stem.PorterStemmer()
		self.stopwords = set([self.clean(x) for x in nltk.corpus.stopwords.words('english')])

	def process_item(self, item, spider):
		if item['shutdown']:
			return item

		print item['url']

		text = item['title'] + " . " + item['extracted_text'] + " . " + item['meta_description']
		words = [self.clean(x) for x in nltk.wordpunct_tokenize(text)]
		item['ordered_words'] = words
		# Tokens were already cleaned above, so a second clean() pass would be
		# redundant; just drop stopwords, punctuation, single characters and
		# pure numbers.
		cleaned_words = [w for w in set(words) - self.stopwords if w.isalnum() and len(w) > 1 and not w.isdigit()]
		item['words'] = cleaned_words
		if not item['words']:
			raise DropItem("No indexable words: %s" % item['url'])

		self.buildWordIndex(item)

		return item

	def buildWordIndex(self, item):
		'''
		Get the current url's id.
		For each word in the current url's text,
			add the url to the set of urls which contain that word.
		'''
		url_id = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
		for word in item['words']:
			# sadd returns 1 only when the word was not yet in the set
			if self.r.sadd(self.WORD_SET, word):
				# first sighting: mint a fresh id from the global counter
				word_id = str(self.r.incr(self.WORD_CTR, 1))
				self.r.set("%s:%s" % (self.WORD2ID, word), word_id)
			else:
				word_id = self.r.get("%s:%s" % (self.WORD2ID, word))
			# inverted index: the set WORD_IN:<word_id> holds the ids of
			# all urls whose text contains this word
			self.r.sadd("%s:%s" % (self.WORD_IN, word_id), url_id)

	def clean(self, s):
		'''Lower-case and Porter-stem a single token.'''
		return self.stemmer.stem(s.lower())
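
Neither example defines Datastore, and the pipeline still has to be registered with Scrapy. Below is a minimal sketch, assuming Datastore is a thin wrapper over redis-py with default connection settings; the class name comes from the snippets, but the base class, host, port and db are assumptions:

import redis

class Datastore(redis.StrictRedis):
	'''Assumed: a thin Redis wrapper with hard-wired connection defaults.'''
	def __init__(self):
		super(Datastore, self).__init__(host='localhost', port=6379, db=0)

The pipeline itself would then be enabled in the project's settings.py (the module path 'myproject.pipelines' is hypothetical):

ITEM_PIPELINES = {
	'myproject.pipelines.KeywordExtractor': 300,
}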
Example #2
import langid
from urlparse import urlparse

from scrapy import log
from scrapy.exceptions import IgnoreRequest

# Datastore: the project's Redis wrapper, see Example #1.


class RequestsLimiter(object):
	def __init__(self):
		self.r = Datastore()
		#self.r.flushdb()
		self.DOMAIN = "DOMAIN"
		self.LIMIT = 200  # max requests allowed per domain
		self.DOMAIN_SET = "DOMAIN_SET"

	def process_request(self, request, spider):
		try:
			domain = urlparse(request.url).hostname
			if int(self.r.get(self.DOMAIN + ":" + domain) or 0) < self.LIMIT:
				self.r.sadd(self.DOMAIN_SET, domain)
				self.r.incr(self.DOMAIN + ":" + domain, 1)
				return None
			else:
				log.msg("DOMAIN limit crossed: %s" % request.url, level=log.CRITICAL)
				raise IgnoreRequest
		except TypeError:
			# hostname is None for malformed urls, which makes the key
			# concatenation above raise TypeError; drop such requests
			raise IgnoreRequest


	def process_response(self, request, response, spider):
		try:
			if 'text/html' not in response.headers['Content-Type'] and 'text/plain' not in response.headers['Content-Type']:
				log.msg("Non-HTML/Plain: %s" % request.url, level=log.CRITICAL)
				raise IgnoreRequest

			if langid.classify(response.body)[0] != 'en':
				log.msg("Non-English: %s" % request.url, level=log.CRITICAL)
				raise IgnoreRequest
		except KeyError:
			# the response carried no Content-Type header at all
			log.msg("KeyError (Content-Type): %s" % request.url, level=log.CRITICAL)
			raise IgnoreRequest

		return response
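
Since RequestsLimiter implements both process_request and process_response, it runs as a downloader middleware. A sketch of the registration in settings.py, with a hypothetical module path and order value:

DOWNLOADER_MIDDLEWARES = {
	'myproject.middlewares.RequestsLimiter': 543,
}

Note that the get-then-incr pair in process_request is not atomic, so concurrent spider processes could let a domain slightly exceed LIMIT; a single incr followed by a comparison of its return value would close that gap.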