# Tokenize and stem the tweets before vectorizing.
# Assumption: the stemmer definition was cut off in the original listing;
# NLTK's PorterStemmer is used here as a stand-in.
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def tokenizer(doc):
    stemmed_words = [stemmer.stem(word) for word in word_tokenize(doc)]
    return stemmed_words

import pandas as pd
import numpy as np

df = pd.read_csv('tweets.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

# Drop the rows with missing tweet text:
fixed_text = text[pd.notnull(text)]
fixed_target = target[pd.notnull(text)]

# Vectorize the tweets with the stemming tokenizer:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(tokenizer=tokenizer)
count_vect.fit(fixed_text)
counts = count_vect.transform(fixed_text)

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

# Score with cross-validation instead of training-set accuracy:
from sklearn.model_selection import cross_val_score, cross_val_predict
scores = cross_val_score(nb, counts, fixed_target)
print(scores)
print(scores.mean())

predictions = cross_val_predict(nb, counts, fixed_target)

# `log` and `run` come from elsewhere in the tutorial, not this listing:
log(run, fixed_text, fixed_target, predictions)
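Since cross_val_predict returns one out-of-fold prediction per tweet, a quick way to see where the classifier struggles is a confusion matrix over those predictions. A minimal sketch, assuming the fixed_target and predictions variables from the listing above:

# Sketch: per-class breakdown of the cross-validated predictions.
# Assumes `fixed_target` and `predictions` from the listing above.
from sklearn.metrics import confusion_matrix, classification_report

labels = sorted(fixed_target.unique())
print(confusion_matrix(fixed_target, predictions, labels=labels))
print(classification_report(fixed_target, predictions, labels=labels))

Rows of the confusion matrix are true labels and columns are predicted labels, so the off-diagonal counts show which emotion classes get mistaken for each other.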
import pandas as pd

df = pd.read_csv('tweets.csv')

# Get pandas Series object of the "tweet text" column:
text = df['tweet_text']

# Get pandas Series object of the "emotion" column:
target = df['is_there_an_emotion_directed_at_a_brand_or_product']

# Remove the blank rows from the series:
target = target[pd.notnull(text)]
text = text[pd.notnull(text)]

# Perform feature extraction:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(text)
counts = count_vect.transform(text)

# Train a Naive Bayes classifier on this data:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(counts, target)

# See what the classifier predicts for the same tweets it was trained on:
predictions = nb.predict(counts)
print(len(predictions))
correct_predictions = sum(predictions == target)
print('Percent correct: ', 100.0 * correct_predictions / len(predictions))

# `log` comes from elsewhere in the tutorial, not this listing:
log(text, target, predictions)
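Because this version scores the classifier on the very tweets it was trained on, the accuracy it prints is optimistic. Cross-validation (shown above) is one fix; a single held-out split is a quicker check. A minimal sketch, assuming the counts and target variables from the listing above:

# Sketch: hold out a test set instead of scoring on the training data.
# Assumes `counts` and `target` from the listing above.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(
    counts, target, test_size=0.2, random_state=0)

nb = MultinomialNB()
nb.fit(X_train, y_train)
print('Held-out accuracy:', nb.score(X_test, y_test))

The held-out accuracy will usually be lower than the training-set figure, which is exactly the gap cross-validation is meant to expose.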