Example #1
0
class DataWriter(object):
	'''
	Scrapy item pipeline that persists crawl output to flat files and Redis.

	Writes one line per item to data/url.txt (the URL), data/keywords.txt
	(comma-terminated keywords) and data/matrix.mtx (web-graph edges in
	Matrix Market triplet form), and bumps the PROCESSED_CTR counter in
	the Datastore.  A sentinel item with item['shutdown'] set closes the
	files and flips the POWER_SWITCH key to "KILL".
	'''
	def __init__(self):
		# Append mode so successive crawl sessions extend the same data set.
		self.f_url = open("data/url.txt", "a+")
		self.f_key = open("data/keywords.txt", "a+")
		self.f_mat = open("data/matrix.mtx", "a+")
		self.f_cla = open("data/classes.txt", "a+")
		self.r = Datastore()

		# Redis key names shared with the other pipeline stages.
		self.URL2ID = "URL2ID"
		self.ID2URL = "ID2URL"
		self.PROCESSED_CTR = "PROCESSED_CTR"

		# Set once a shutdown item has closed the output files; guards
		# against writing to closed handles if late items arrive.
		self.closed = False

	def process_item(self, item, spider):
		'''
		Persist one crawled item, or tear down on a shutdown sentinel.
		Returns the item so downstream pipeline stages still see it.
		'''
		if item['shutdown']:
			if not self.closed:
				self.f_url.close()
				self.f_key.close()
				self.f_mat.close()
				self.f_cla.close()
				self.closed = True
			self.r.set("POWER_SWITCH", "KILL")
			return item

		if self.closed:
			# Files were already closed by an earlier shutdown item;
			# silently pass late items through instead of crashing.
			return item

		self.writeURL(item)
		self.writeKeywords(item)
		self.writeWebMatrix(item)
		#self.writeClasses(item)
		self.r.incr(self.PROCESSED_CTR, 1)
		return item

	def writeURL(self, item):
		# One URL per line.
		self.f_url.write(item['url'] + "\n")

	def writeKeywords(self, item):
		# One comma-terminated token per keyword, newline-terminated row.
		# Single write instead of one tiny write per keyword.
		self.f_key.write("".join("%s," % k for k in item['words']) + "\n")

	def writeWebMatrix(self, item):
		'''
		Builds web graph in matrix market format file: one
		"source_id<TAB>target_id<TAB>1" edge per unique outlink.
		'''
		u = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
		v = 0
		for link in set(item['link_set']):
			v = self.r.get("%s:%s" % (self.URL2ID, hashxx(link)))
			self.f_mat.write("%s\t%s\t1\n" % (u, v))

	def writeClasses(self, item):
		# NOTE(review): self.classes is never initialized (its setup is
		# commented out in __init__), so calling this raises
		# AttributeError.  The call site in process_item is also
		# commented out — confirm intent before re-enabling either.
		self.f_cla.write("%s:%s\n" % (item['title'], self.classes[item['predict'][0]]))
Example #2
0
class KeywordExtractor(object):
	'''
	Extracts keywords from title, extracted_text, meta_description
	'''
	def __init__(self):
		self.r = Datastore()
		self.URL2ID = "URL2ID"
		self.WORD_SET = "WORD_SET"
		self.WORD2ID = "WORD2ID"
		self.WORD_IN = "WORD_IN"
		self.WORD_CTR = "WORD_CTR"
		#self.r.set(self.WORD_CTR, -1)
		self.stemmer = nltk.stem.PorterStemmer()
		self.stopwords = set([self.clean(x) for x in nltk.corpus.stopwords.words('english')])

	def process_item(self, item, spider):
		if item['shutdown']:
			return item

		print item['url']

		text = item['title'] + " . " + item['extracted_text'] + " . " + item['meta_description']
		words = [self.clean(x) for x in nltk.wordpunct_tokenize(text)]
		item['ordered_words'] = words
		cleaned_words = set(words) - self.stopwords
		cleaned_words = [self.clean(w) for w in cleaned_words if w.isalnum() and len(w) > 1 and not w.isdigit()]
		item['words'] = cleaned_words
		if not item['words']:
			raise DropItem

		self.buildWordIndex(item)

		return item

	def buildWordIndex(self, item):
		'''
		Get current url id
		For each word in current url's text,
			add the url to the set of urls which contain that word
		'''
		url_id = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
		word_id = ""
		for word in item['words']:
			if self.r.sadd(self.WORD_SET, word):
				word_id = str(self.r.incr(self.WORD_CTR, 1))
				self.r.set("%s:%s" % (self.WORD2ID, word), word_id)
			else:
				word_id = self.r.get("%s:%s" % (self.WORD2ID, word))
			self.r.sadd("%s:%s" % (self.WORD_IN, word_id), url_id)

	def clean(self, s):
		return self.stemmer.stem(s.lower())
Example #3
0
class DuplicatesFilter(object):
	'''
	Filters duplicate urls
	Assigns each url a unique id
	Sets url -> id and id -> url in redis
	Indexes inlinks and outlinks
	'''
	def __init__(self):
		self.r = Datastore()
		# Redis key names shared with the other pipeline stages.
		self.URL2ID = "URL2ID"
		self.ID2URL = "ID2URL"
		self.URL_SET = "URL_SET"
		self.URL_CTR = "URL_CTR"
		# Combined-RSS kill threshold: 10 GB across redis + scrapy.
		self.MEM_THRESHOLD = 10 * (10 ** 9)
		self.redis_process = None
		self.scrapy_process = None

		# Locate the running redis-server and scrapy processes so
		# process_item can watch their memory usage.  Either may stay
		# None if no matching process is found.
		for proc in psutil.process_iter():
			if "redis-server" in proc.name:
				self.redis_process = proc
			if "scrapy" in proc.name:
				self.scrapy_process = proc

	def process_item(self, item, spider):
		if not item:
			raise DropItem

		print "DuplicatesFilter:", item['url']

		if self.redis_process.get_memory_info().rss + self.scrapy_process.get_memory_info().rss > self.MEM_THRESHOLD:
			self.r.set("POWER_SWITCH", "OFF")
			item['shutdown'] = True

		if item['shutdown']:
			return item

		if not item['link_set']:
			raise DropItem
		else:
			self.buildURLIndex(item)
		return item

	def buildURLIndex(self, item):
		'''