# Trains a Naive Bayes tweet-sentiment classifier on an 80/20 split of the
# labelled tweet files and evaluates it on the held-out 20%.
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation


def train():
    positive_tweets = read_tweets('/root/295/new/positive.txt', 'positive')
    negative_tweets = read_tweets('/root/295/new/negative.txt', 'negative')
    print(len(positive_tweets))
    print(len(negative_tweets))

    # 80/20 train/test split; // keeps the cutoffs integral. The original
    # sliced the negative test set with the positive cutoff and skipped one
    # tweet with "+1"; both fixed here.
    pos_cutoff = len(positive_tweets) * 80 // 100
    neg_cutoff = len(negative_tweets) * 80 // 100
    pos_train = positive_tweets[:pos_cutoff]
    neg_train = negative_tweets[:neg_cutoff]
    pos_test = positive_tweets[pos_cutoff:]
    neg_test = negative_tweets[neg_cutoff:]
    training_data = pos_train + neg_train
    test_data = pos_test + neg_test

    sentim_analyzer = SentimentAnalyzer()
    # mark_negation appends "_NEG" to tokens that follow a negation word.
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_data])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    print(len(unigram_feats))
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    training_set = sentim_analyzer.apply_features(training_data)
    test_set = sentim_analyzer.apply_features(test_data)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))
    # tokenize_sentance (sic) is a tokenizer helper defined elsewhere.
    print(sentim_analyzer.classify(tokenize_sentance('I hate driving car at night')))
    return sentim_analyzer
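# A minimal sketch of the read_tweets helper assumed above (the original is
# defined elsewhere and not shown): one tweet per line, tokenized and paired
# with the given label.
from nltk import word_tokenize

def read_tweets(path, label):
    with open(path) as f:
        return [(word_tokenize(line.strip()), label) for line in f]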
"southwestair": "Southwest Airlines", "delta": "Delta" } for airline_name in ["americanair", "united", "southwestair", "delta"]: with open("case_study_dataset_{}.csv".format(airline_name), "r") as file_handle: next(file_handle) # Skip the header dataset = list() for line in file_handle.readlines(): dataset.append( nltk.word_tokenize( tweet_preprocessor.clean(html.unescape(line)))) positive_sentiment_count = 0 negative_sentiment_count = 0 for tweet in dataset: sentiment_score = sentim_analyzer.classify(tweet) if sentiment_score == 0: negative_sentiment_count = negative_sentiment_count + 1 else: positive_sentiment_count = positive_sentiment_count + 1 results_dict["Airline"].append(airline_friendly_name_map[airline_name]) results_dict["Positive Sentiment Count"].append(positive_sentiment_count) results_dict["Negative Sentiment Count"].append(negative_sentiment_count) pd.DataFrame(results_dict).to_csv( "case_study_naive_bayes_classifier_with_emojis.csv")
# Builds unigram features over the training tweets, trains a Naive Bayes
# classifier, pickles it, and reports evaluation metrics on the test split.
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats

print(len(test), len(train))
sentiment_analyzer = SentimentAnalyzer()
all_words = sentiment_analyzer.all_words([doc[0] for doc in train])

# unigram_word_feats replaces a hand-rolled term-frequency count that was
# commented out here: keep every token appearing at least 4 times.
unigrams = sentiment_analyzer.unigram_word_feats(all_words, min_freq=4)
# bigrams = sentiment_analyzer.bigram_collocation_feats(
#     [doc[0] for doc in train], top_n=1000)

sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)
# sentiment_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigrams)

training_set = sentiment_analyzer.apply_features(train)
test_set = sentiment_analyzer.apply_features(test)

trainer = NaiveBayesClassifier.train
classifier = sentiment_analyzer.train(trainer, training_set)
save_file(sentiment_analyzer, "sentiment_classifier.pkl")

for key, value in sorted(sentiment_analyzer.evaluate(test_set).items()):
    print("{0}: {1}".format(key, value))
print(test[0], sentiment_analyzer.classify(test[0][0]))
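# A plausible save_file helper matching the call above (hypothetical; the
# real one is defined elsewhere): plain pickle serialization to match the
# .pkl filename.
import pickle

def save_file(obj, filename):
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)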
import os
import pickle

from nltk import sent_tokenize, word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer


class DiplomaSentimentAnalyzer:
    """Subjectivity classifier trained on the Pang/Lee sentence corpora,
    combined with VADER polarity scoring."""

    def __init__(self, n_instances=500):
        self.n_instances = n_instances
        self.subj_classifier = None
        self.sentim_analyzer = None
        try:
            # Reuse a previously pickled analyzer if one exists.
            BASE_DIR = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            # os.path.join instead of the original backslash-in-string path.
            path = os.path.join(BASE_DIR, 'main', 'my_classifier.pickle')
            with open(path, 'rb') as f:
                self.sentim_analyzer = pickle.load(f)
        except IOError:
            # plot.* sentences are objective, quote.* subjective
            # (the Pang/Lee subjectivity dataset).
            with open('plot.tok.gt9.5000') as f:
                obj_sents = f.read()
            with open('quote.tok.gt9.5000') as f:
                subj_sents = f.read()
            self.obj_sents = obj_sents
            self.sentim_analyzer = SentimentAnalyzer()
            self.train_diploma(subj_sents, obj_sents)

    def prepair_train_data(self, sents, category):
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        sents_final = []
        for k in sent_tokenize(sents):
            sent = [w.lower() for w in word_tokenize(k) if w not in stop_words]
            sent = [lemmatizer.lemmatize(j) for j in sent]
            sents_final.append(sent)
        return [(i, category) for i in sents_final]

    def train_diploma(self, subj_sents, obj_sents):
        train_subj = self.prepair_train_data(subj_sents, 'subj')[:self.n_instances]
        train_obj = self.prepair_train_data(obj_sents, 'obj')[:self.n_instances]
        training_docs = train_subj + train_obj

        all_words_neg = self.sentim_analyzer.all_words(
            [mark_negation(doc) for doc in training_docs])
        unigram_feats = self.sentim_analyzer.unigram_word_feats(
            all_words_neg, min_freq=4)
        self.sentim_analyzer.add_feat_extractor(
            extract_unigram_feats, unigrams=unigram_feats)
        training_set = self.sentim_analyzer.apply_features(training_docs)

        trainer = NaiveBayesClassifier.train
        self.subj_classifier = self.sentim_analyzer.train(trainer, training_set)
        # Note: saved to the working directory, while __init__ loads from
        # main/; kept as in the source.
        with open('my_classifier.pickle', 'wb') as f:
            pickle.dump(self.sentim_analyzer, f)

    def get_sentiment_values(self, text):
        result = {}
        if text:
            stop_words = set(stopwords.words('english'))
            words = [w.lower() for w in word_tokenize(text) if w not in stop_words]
            lemmatizer = WordNetLemmatizer()
            words = [lemmatizer.lemmatize(j) for j in words]
            subj_count = 0
            for w in words:
                # classify expects a document (a list of tokens); the
                # original passed the set literal {w, True}.
                if self.sentim_analyzer.classify([w]) == 'subj':
                    subj_count += 1
            result['subjectivity'] = round(subj_count / len(words), 2) * 100
            sid = SentimentIntensityAnalyzer()
            # Map VADER's compound score from [-1, 1] onto [0, 100].
            polarity = sid.polarity_scores(text)['compound']
            result['polarity'] = (polarity / 2 + 0.5) * 100
        return result
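# Example use of the class above; the numbers depend on the trained model.
analyzer = DiplomaSentimentAnalyzer(n_instances=500)
print(analyzer.get_sentiment_values('I absolutely loved this film.'))
# -> {'subjectivity': <0..100>, 'polarity': <0..100>}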
# Hadoop-streaming-style mapper: builds a classifier from a labelled
# training file, then emits "<sentiment>\t<weight>" for every tweet on stdin.
import sys

from nltk import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation

# Assumed set up earlier in the script (not shown in the source):
#   sa = SentimentAnalyzer()
#   trainingset = []
#   f = open(<training file>)  # lines of the form "<label>,<text>"

for line in f:
    senti = line.split(",")[0]
    content = line[len(senti) + 1:]
    tokens = word_tokenize(content.rstrip())
    trainingset.append((tokens, senti))

all_words_neg = sa.all_words([mark_negation(doc) for doc in trainingset])
unigram_feats = sa.unigram_word_feats(all_words_neg, min_freq=4)
sa.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
training_set = sa.apply_features(trainingset)
# The source built training_set but never trained a classifier, so the
# sa.classify() call below would fail; train one here.
sa.train(NaiveBayesClassifier.train, training_set)

for line in sys.stdin:
    if "username" in line:  # skip the header row
        continue
    fields = line.split(";")
    tweet = fields[4]
    likes = int(fields[3])
    num = 1 + likes  # the original if/else on likes == 0 reduces to this
    tweetWords = []
    for word in tweet.split():
        word = word.lower()
        word = word.strip('@#\'"?,.!')
        tweetWords.append(word)
    sentiment = sa.classify(tweetWords)
    print('%s\t%s' % (sentiment, num))
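# Typical streaming invocation of this mapper (assuming it is saved as
# mapper.py and the tweets dump is the ';'-separated file read above):
#   cat tweets.csv | python mapper.py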
import datetime

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats


class SuicideClassifier(object):

    def __init__(self, sentiment_only, num_phrases_to_track=20):
        # The phrase files were generated once by filtering the raw CSVs:
        #   neg_phrases = filter_negative_phrases(load_csv_sentences('thoughtsandfeelings.csv'))
        #   pos_phrases = filter_positive_phrases(load_csv_sentences('spiritualforums.csv'))
        # and writing the results to pos_phrases.txt / neg_phrases.txt.
        self.recent_sentiment_scores = []

        with open("ALL_neg_phrases_filtered.txt") as neg_file:
            neg_phrases = neg_file.readlines()
        with open("webtext_phrases_with_lots_of_words.txt") as pos_file:
            pos_phrases = pos_file.readlines()

        neg_docs = [(phrase.split(), 'suicidal') for phrase in neg_phrases]
        # Truncate the positive phrases so both classes are the same size.
        pos_docs = [(phrase.split(), 'alright')
                    for phrase in pos_phrases[:len(neg_phrases)]]
        print(len(neg_docs))
        print(len(pos_docs))

        # Hold out the last 200 documents of each class for testing
        # (an earlier revision used a 3/4 split instead).
        negcutoff = -200
        poscutoff = -200
        train_pos_docs = pos_docs[:poscutoff]
        test_pos_docs = pos_docs[poscutoff:]
        train_neg_docs = neg_docs[:negcutoff]
        test_neg_docs = neg_docs[negcutoff:]
        training_docs = train_pos_docs + train_neg_docs
        testing_docs = test_pos_docs + test_neg_docs

        self.sentim_analyzer = SentimentAnalyzer()
        if not sentiment_only:
            all_words = self.sentim_analyzer.all_words(training_docs)
            unigram_feats = self.sentim_analyzer.unigram_word_feats(
                all_words, min_freq=1)
            self.sentim_analyzer.add_feat_extractor(
                extract_unigram_feats, unigrams=unigram_feats)
        # vader_sentiment_feat is a custom feature extractor defined
        # elsewhere in this module.
        self.sentim_analyzer.add_feat_extractor(vader_sentiment_feat)
        # bigram_feats = self.sentim_analyzer.bigram_collocation_feats(all_words, min_freq=1)
        # self.sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_feats)

        training_set = self.sentim_analyzer.apply_features(training_docs)
        test_set = self.sentim_analyzer.apply_features(testing_docs)
        trainer = NaiveBayesClassifier.train
        self.classifier = self.sentim_analyzer.train(trainer, training_set)
        for key, value in sorted(self.sentim_analyzer.evaluate(test_set).items()):
            print('{0}: {1}'.format(key, value))
        self.classifier.show_most_informative_features(20)

    def test(self, phrase):
        return self.sentim_analyzer.classify(phrase.split())

    def update_sentiments(self, value):
        # Keep only the scores from the last 60 seconds and return their mean.
        now = datetime.datetime.now()
        self.recent_sentiment_scores.append([now, value])
        self.recent_sentiment_scores = [
            x for x in self.recent_sentiment_scores
            if x[0] > now - datetime.timedelta(seconds=60)]
        mean_score = (sum(x[1] for x in self.recent_sentiment_scores)
                      / len(self.recent_sentiment_scores))
        print(mean_score)
        return mean_score
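# Example use of the classifier above (assumes the vader_sentiment_feat
# extractor and both phrase files exist).
clf = SuicideClassifier(sentiment_only=False)
print(clf.test('I feel fine today'))  # -> 'alright' or 'suicidal'
clf.update_sentiments(1)              # rolling 60-second mean of scores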