예제 #1
0
 def processBody(self):
     """Rank keywords for the page body and return the queue built from it.

     The body is passed through stripScript, remove_html_tags and
     stripPunctuation and the result is stored in self.text. Text of at
     most 5000 characters is handed to rankKeywords directly and the
     result is appended to self.keyword_dicts. Longer text is cut at the
     space positions reported by self.findnth(..., ' ', 500); every piece,
     including the remainder after the last cut, is ranked through a Pool
     with one worker per piece, and self.keyword_dicts is replaced by the
     list of results.

     Returns:
         The value returned by ready_queue(self.url, self.body).

     Raises:
         SystemExit: if a KeyboardInterrupt arrives while the pool is
             working (the pool is terminated first).
     """
     queue = ready_queue(self.url, self.body)
     self.text = stripPunctuation(
         self.remove_html_tags(stripScript(self.body)))

     if len(self.text) <= 5000:
         self.keyword_dicts.append(rankKeywords(self.text))
         return queue

     chunks = []
     start = 0
     while True:
         # findnth only sees the slice, so its result is an offset relative
         # to `start`, not an index into self.text.
         # NOTE(review): assumes findnth returns -1 when the slice holds no
         # further cut position -- confirm against its definition.
         cut = self.findnth(self.text[start:], ' ', 500)
         if cut == -1:
             # Keep the remainder instead of silently dropping it.
             chunks.append(self.text[start:])
             break
         chunks.append(self.text[start:start + cut])
         start += cut + 1  # skip the separating space
     logger.debug("processing with %i threads" % len(chunks))

     # Create the pool outside the try block so the handlers below never
     # touch an unbound name.
     pool = Pool(processes=len(chunks))
     try:
         self.keyword_dicts = pool.map(rankKeywords, chunks)
     except KeyboardInterrupt:
         pool.terminate()
         pool.join()
         sys.exit()
     except Exception:
         # Do not leave worker processes behind when ranking fails.
         pool.terminate()
         pool.join()
         raise
     else:
         pool.close()
         pool.join()
     logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
     return queue
예제 #2
0
 def processBody(self):
     """Rank keywords for the page body and return the queue built from it.

     The body is passed through stripScript, remove_html_tags and
     stripPunctuation and the result is stored in self.text. Text of at
     most 5000 characters is handed to rankKeywords directly and the
     result is appended to self.keyword_dicts. Longer text is cut at the
     space positions reported by self.findnth(..., ' ', 500); every piece,
     including the remainder after the last cut, is ranked through a Pool
     with one worker per piece, and self.keyword_dicts is replaced by the
     list of results.

     Returns:
         The value returned by ready_queue(self.url, self.body).

     Raises:
         SystemExit: if a KeyboardInterrupt arrives while the pool is
             working (the pool is terminated first).
     """
     queue = ready_queue(self.url, self.body)
     self.text = stripPunctuation(
         self.remove_html_tags(stripScript(self.body)))

     if len(self.text) <= 5000:
         self.keyword_dicts.append(rankKeywords(self.text))
         return queue

     chunks = []
     start = 0
     while True:
         # The offset returned by findnth is relative to the slice it was
         # given, so it has to be added to `start` before slicing self.text.
         # NOTE(review): assumes findnth returns -1 when the slice holds no
         # further cut position -- confirm against its definition.
         cut = self.findnth(self.text[start:], ' ', 500)
         if cut == -1:
             # The remainder is ranked too rather than being discarded.
             chunks.append(self.text[start:])
             break
         chunks.append(self.text[start:start + cut])
         start += cut + 1  # skip the separating space
     logger.debug("processing with %i threads" % len(chunks))

     # The pool is created before the try block so that every handler can
     # rely on `pool` being bound.
     pool = Pool(processes=len(chunks))
     try:
         self.keyword_dicts = pool.map(rankKeywords, chunks)
     except KeyboardInterrupt:
         pool.terminate()
         pool.join()
         sys.exit()
     except Exception:
         # Tear the workers down before propagating a ranking failure.
         pool.terminate()
         pool.join()
         raise
     else:
         pool.close()
         pool.join()
     logger.debug("processed, returned %i dicts" %
                  len(self.keyword_dicts))
     return queue
예제 #3
0
	def process(self):
		"""Split the raw HTML in self.text into title, head and body, then queue links.

		Sets self.title, self.head and self.body from the markup, replaces
		self.text with the body passed through stripScript,
		remove_html_tags and stripPunctuation, and stores the result of
		ready_queue(self.url, self.body) in self.links.

		Tag names are matched case-insensitively. self.title and self.head
		hold only the content between the opening and closing tags; when
		either tag is missing they are set to ''. self.body keeps its
		opening <body ...> tag, is '' when no <body tag exists, and runs to
		the end of the document when </body> is missing.

		Returns:
			The value stored in self.links.
		"""
		html = self.text
		text_lower = html.lower()

		def element_content(name):
			# Content between the '>' closing the opening tag and the
			# matching end tag; '' when any of the three is missing, so a
			# failed find() can never produce an arbitrary slice.
			start = text_lower.find('<' + name)
			if start == -1:
				return ''
			tag_end = text_lower.find('>', start)
			if tag_end == -1:
				return ''
			stop = text_lower.find('</' + name + '>', tag_end + 1)
			if stop == -1:
				return ''
			return html[tag_end + 1:stop]

		self.title = element_content('title')
		self.head = element_content('head')

		body_start = text_lower.find('<body')
		if body_start == -1:
			self.body = ''
		else:
			body_stop = text_lower.find('</body>', body_start)
			if body_stop == -1:
				# Unterminated body: take everything that is left.
				body_stop = len(html)
			self.body = html[body_start:body_stop]

		self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))

		self.links = ready_queue(self.url, self.body)
		return self.links
예제 #4
0
 def processBody(self):
     """Rank keywords for the page body and return the queue built from it.

     The body is passed through stripScript, remove_html_tags and
     stripPunctuation and the result is stored in self.text. Text of at
     most 5000 characters is handed to rankKeywords directly and the
     result is appended to self.keyword_dicts. Longer text is cut at the
     space positions reported by self.findnth(..., ' ', 500); every piece,
     including the complete remainder after the last cut, is ranked
     through a Pool with one worker per piece, and self.keyword_dicts is
     replaced by the list of results.

     Returns:
         The value returned by ready_queue(self.url, self.body).

     Raises:
         SystemExit: if a KeyboardInterrupt arrives while the pool is
             working (the pool is terminated first).
     """
     queue = ready_queue(self.url, self.body)
     self.text = stripPunctuation(
         self.remove_html_tags(stripScript(self.body)))

     if len(self.text) <= 5000:
         self.keyword_dicts.append(rankKeywords(self.text))
         return queue

     chunks = []
     start = 0
     while True:
         # findnth sees only the slice, so `cut` is relative to `start`.
         cut = self.findnth(self.text[start:], ' ', 500)
         if cut == -1:
             # No further cut position: take the whole remainder. Using
             # the -1 itself as a slice bound would drop the last
             # character of the text.
             chunks.append(self.text[start:])
             break
         chunks.append(self.text[start:start + cut])
         start += cut + 1  # skip the separating space
     logger.debug("processing with %i threads" % len(chunks))

     # `chunks` always holds at least one entry here, and creating the pool
     # before the try block keeps `pool` bound for every handler.
     pool = Pool(processes=len(chunks))
     try:
         self.keyword_dicts = pool.map(rankKeywords, chunks)
     except KeyboardInterrupt:
         pool.terminate()
         pool.join()
         sys.exit()
     except Exception:
         # Do not leave worker processes behind when ranking fails.
         pool.terminate()
         pool.join()
         raise
     else:
         pool.close()
         pool.join()
     logger.debug("processed, returned %i dicts" %
                  len(self.keyword_dicts))
     return queue
예제 #5
0
	def processBody(self):
		"""Rank keywords for the page body and return the queue built from it.

		The body is passed through stripScript, remove_html_tags and
		stripPunctuation and the result is stored in self.text. Text of at
		most 5000 characters is handed to rankKeywords directly and the
		result is appended to self.keyword_dicts. Longer text is cut at the
		space positions reported by self.findnth(..., ' ', 500); every piece,
		including the complete remainder after the last cut, is ranked
		through a Pool with one worker per piece, and self.keyword_dicts is
		replaced by the list of results.

		Returns:
			The value returned by ready_queue(self.url, self.body).

		Raises:
			SystemExit: if a KeyboardInterrupt arrives while the pool is
				working (the pool is terminated first).
		"""
		queue = ready_queue(self.url, self.body)
		self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))

		if len(self.text) <= 5000:
			self.keyword_dicts.append(rankKeywords(self.text))
			return queue

		chunks = []
		start = 0
		while True:
			# findnth sees only the slice, so `cut` is relative to `start`.
			cut = self.findnth(self.text[start:], ' ', 500)
			if cut == -1:
				# No further cut position: take the whole remainder. Using
				# the -1 itself as a slice bound would drop the last
				# character of the text.
				chunks.append(self.text[start:])
				break
			chunks.append(self.text[start:start + cut])
			start += cut + 1  # skip the separating space
		logger.debug("processing with %i threads" % len(chunks))

		# `chunks` always holds at least one entry here, and creating the
		# pool before the try block keeps `pool` bound for every handler.
		pool = Pool(processes=len(chunks))
		try:
			self.keyword_dicts = pool.map(rankKeywords, chunks)
		except KeyboardInterrupt:
			pool.terminate()
			pool.join()
			sys.exit()
		except Exception:
			# Do not leave worker processes behind when ranking fails.
			pool.terminate()
			pool.join()
			raise
		else:
			pool.close()
			pool.join()
		logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
		return queue