def string_to_bag_of_words(self, text):
    """Convert free text into a bag-of-words Counter.

    Lowercases *text*, tokenizes on word characters, drops English
    stopwords, lemmatizes the survivors, and returns a Counter mapping
    each lemma to its frequency.

    Args:
        text: raw input string.

    Returns:
        collections.Counter of lemma -> count.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Hoisted: one lemmatizer for the whole call, not one per token
    # as the original comprehension did.
    lemmatizer = WordNetLemmatizer()
    return Counter(lemmatizer.lemmatize(t) for t in tokens if t not in stop_words)
def getAllReviews(movieList):
    """Flatten user reviews from all movies into labeled token lists.

    Each review text is lowercased, word-tokenized, Porter-stemmed,
    and stripped of English stopwords. A review is labeled 'pos' when
    its "score" is >= 30, else 'neg'.

    Args:
        movieList: iterable of dicts, each with a "reviews" sequence of
            dicts carrying "review" (text) and "score" (number).

    Returns:
        list of (tokens, label) tuples, tokens being a list of strings.
    """
    # Py3 fix: the original passed a lazy `map` object to np.array,
    # which yields a 0-d object array that np.concatenate rejects.
    reviews = np.concatenate([movie["reviews"] for movie in movieList])
    # Hoist loop invariants: tokenizer/stemmer were rebuilt per review/token,
    # and the stopword *list* was re-fetched and linearly scanned per token.
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokenizeReview = []
    for review in reviews:
        tokens = tokenizer.tokenize(review['review'].lower())
        tokens = [stemmer.stem(t) for t in tokens]
        # Py3 fix: materialize the filter; the original appended a lazy
        # filter object, not a list of tokens.
        tokens = [t for t in tokens if t not in stop_words]
        tokenizeReview.append((tokens, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview
def getAllCritics(movieList):
    """Flatten critic reviews from all movies into labeled token lists.

    Each review text is lowercased, word-tokenized, Porter-stemmed,
    and stripped of English stopwords. A review is labeled 'pos' when
    its "tomatometer" equals "fresh", else 'neg'.

    Args:
        movieList: iterable of dicts, each with a "critics" sequence of
            dicts carrying "review" (text) and "tomatometer" (string).

    Returns:
        list of (tokens, label) tuples, tokens being a list of strings.
    """
    # Py3 fix: the original passed a lazy `map` object to np.array,
    # which yields a 0-d object array that np.concatenate rejects.
    reviews = np.concatenate([movie["critics"] for movie in movieList])
    # Hoist loop invariants: tokenizer/stemmer were rebuilt per review/token,
    # and the stopword *list* was re-fetched and linearly scanned per token.
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokenizeReview = []
    for review in reviews:
        tokens = tokenizer.tokenize(review['review'].lower())
        tokens = [stemmer.stem(t) for t in tokens]
        # Py3 fix: materialize the filter; the original appended a lazy
        # filter object, not a list of tokens.
        tokens = [t for t in tokens if t not in stop_words]
        tokenizeReview.append((tokens, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview
def string_to_bag_of_words(self, text):
    """Convert free text into a bag-of-words Counter, honoring config flags.

    Lowercases and word-tokenizes *text*, then optionally removes English
    stopwords (``self.filter_stopwords``) and optionally lemmatizes
    (``self.enable_stemming``). Stopword filtering is applied to the raw
    tokens before lemmatization, matching the original combined branch.

    Args:
        text: raw input string.

    Returns:
        collections.Counter of token (or lemma) -> count.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    # The original 4-way if/elif duplicated each step; the two flags are
    # independent, so apply them as two sequential guarded passes.
    if self.filter_stopwords:
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop_words]
    if self.enable_stemming:
        # Hoisted: one lemmatizer per call, not one per token.
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return Counter(tokens)