def tokenize(self):
    """
    Tokenize all documents for each user in self.users.
    """
    print "Tokenizing users.\n"
    for user, docs in self.users.iteritems():
        # Join the user's documents, drop "w/" shorthand and newlines, lowercase.
        doc_lower = " ".join(docs).lower().replace("w/", "").replace("\n", "")
        # Expand contractions (e.g. "don't" -> "do not") before tokenizing.
        doc_final = anticontract.expand_contractions(doc_lower)
        tokens = self.tokenizer.tokenize(doc_final)
        # Drop English stop words and store the cleaned tokens per user.
        clean_tokens = [token for token in tokens if token not in self.stop_en]
        self.tokenized_docs[user] = clean_tokens
    print "Done tokenizing users.\n"
Example #2
def tokenize(self):
    """
    Tokenize every review in self.reviews and store the cleaned tokens
    per review id in self.tokenized_docs.
    """
    print 'Tokenizing reviews.\n'
    for doc in self.reviews:
        # Drop "w/" shorthand and newlines, then lowercase the review text.
        raw_doc = doc['text'].replace("w/", "").replace("\n", "")
        doc_lower = raw_doc.lower()
        # Expand contractions (e.g. "can't" -> "cannot") before tokenizing.
        doc_final = anticontract.expand_contractions(doc_lower)
        tokens = self.tokenizer.tokenize(doc_final)
        # Drop English stop words, then index by review id along with the author.
        clean_tokens = [token for token in tokens if token not in self.stop_en]
        self.tokenized_docs[doc['review_id']] = {'tokens': clean_tokens, 'user': doc['user_id']}
    print 'Done tokenizing reviews.\n'
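After this runs, self.tokenized_docs maps each review id to its cleaned tokens plus the author's user id. A short sketch of how downstream code might regroup those tokens per user; the helper name is illustrative, not part of the original project:

from collections import defaultdict

def tokens_by_user(tokenized_docs):
    # tokenized_docs: {review_id: {'tokens': [...], 'user': user_id}}
    grouped = defaultdict(list)
    for entry in tokenized_docs.itervalues():   # .values() on Python 3
        grouped[entry['user']].extend(entry['tokens'])
    return grouped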