def tokenize(self): print "Tokenizing users.\n" for user, docs in self.users.iteritems(): doc_lower = " ".join(docs).lower().replace("w/", "").replace("\n", "") doc_final = anticontract.expand_contractions(doc_lower) tokens = self.tokenizer.tokenize(doc_final) clean_tokens = [token for token in tokens if token not in self.stop_en] self.tokenized_docs[user] = clean_tokens print "Done tokenizing users.\n"
def tokenize(self): """ Tokenize (extract unique tokens) all reviews given in self.reviews """ print 'Tokenizing reviews.\n' for doc in self.reviews: raw_doc = doc['text'].replace("w/", "") raw_doc = raw_doc.replace("\n", "") doc_lower = raw_doc.lower() doc_final = anticontract.expand_contractions(doc_lower) tokens = self.tokenizer.tokenize(doc_final) clean_tokens = [token for token in tokens if token not in self.stop_en] self.tokenized_docs[doc['review_id']] = {'tokens': clean_tokens, 'user': doc['user_id']} print 'Done tokenizing reviews.\n'