import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def transform(raw_reviews: pd.DataFrame) -> pd.DataFrame:
    """Applies sentiment analysis to the reviews."""
    transformed_reviews = raw_reviews.copy()

    # Prepare the sentiment analyzer (NLTK_DATA_DIRECTORY is a module-level constant).
    nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIRECTORY)
    nltk.data.path.append(NLTK_DATA_DIRECTORY)
    sentiment_analyzer = SentimentIntensityAnalyzer()

    sentiments = []
    for _, review in transformed_reviews["comments"].items():
        sentiment_compound = sentiment_analyzer.polarity_scores(review)["compound"]
        # Classify the sentiment as positive, negative, or neutral using
        # VADER's conventional +/-0.05 compound thresholds.
        if sentiment_compound >= 0.05:
            sentiment = "positive"
        elif sentiment_compound <= -0.05:
            sentiment = "negative"
        else:
            sentiment = "neutral"
        sentiments.append(sentiment)

    transformed_reviews["sentiments"] = sentiments
    return transformed_reviews
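
# A minimal usage sketch for transform() (illustrative only: the DataFrame
# below is made up, and NLTK_DATA_DIRECTORY is assumed to be defined in the
# surrounding module).
reviews = pd.DataFrame({"comments": [
    "Great place, spotless room, lovely host!",
    "The heating was broken and nobody answered our calls.",
]})
print(transform(reviews)[["comments", "sentiments"]])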

def Analyze(self, request, context):
    """
    The method that is exposed to the snet-cli call command.

    :param request: incoming data
    :param context: object that provides RPC-specific information (timeout, etc.)
    :return: an OutputMessage() with the analysis results
    """
    # In our case, request is an InputMessage() object (from the .proto file).
    self.value = request.value

    # Convert the value to a JSON array of sentences.
    sentence_list = json.loads(self.value)

    result_list = []

    # Sentiment analyzer instance.
    analyzer = SentimentIntensityAnalyzer()

    for sentence_item in sentence_list:
        # Classify each sentence.
        analysis = str(analyzer.polarity_scores(sentence_item["sentence"]))
        result_list.append({
            "id": sentence_item["id"],
            "analysis": analysis,
        })

    # To respond we need to create an OutputMessage() object (from the .proto file).
    self.result = OutputMessage()
    self.result.value = json.dumps(result_list)
    return self.result

def sentiment_score(text):
    # Naive sentence split on periods; each sentence gets its own compound score.
    list_text = text.split('.')
    analyzer = SentimentIntensityAnalyzer()
    list_scores = []
    for sentence in list_text:
        list_scores.append(analyzer.polarity_scores(sentence)['compound'])
    return list_scores

import nltk
import numpy as np
import matplotlib.pyplot as plt
from tkinter import filedialog
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def graph_eng():
    file = filedialog.askopenfilename(
        filetypes=(("Text files", "*.txt"), ("all files", "*.*")))
    with open(file) as f:
        raw = f.read()
    sentences = nltk.sent_tokenize(raw)
    sid = SentimentIntensityAnalyzer()

    positive_values = []
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        positive_values.append(ss.get('pos'))
    summary = len(positive_values)

    negative_values = []
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        negative_values.append(ss.get('neg'))

    n_value = np.array(negative_values)
    p_value = np.array(positive_values)
    counts_value = np.arange(summary)
    # Plot both the positive and the negative scores against the sentence index.
    plt.plot(counts_value, p_value, counts_value, n_value)
    plt.show()

def score_data(data):
    '''
    Computes VADER sentiment scores for every string in the data passed in.

    Input:
        - data: a pandas Series object containing strings to be scored

    Returns:
        pos, neu, neg, com: list objects containing the respective scores
        for each string
    '''
    sid = SentimentIntensityAnalyzer()
    pos = []
    neu = []
    neg = []
    com = []
    for i in range(data.shape[0]):
        # .iloc avoids KeyErrors when the Series has a non-default index.
        score = sid.polarity_scores(data.iloc[i])
        pos.append(score['pos'])
        neu.append(score['neu'])
        neg.append(score['neg'])
        com.append(score['compound'])
    return pos, neu, neg, com

def perform_sentiment_analysis(text):
    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(text)
    # Only compare the neg/neu/pos proportions: 'compound' is on a different
    # scale (-1 to 1), and if it won the max() it would raise a KeyError below.
    sentiment = {'neg': 'Negative', 'neu': 'Neutral', 'pos': 'Positive'}
    strongest = max(sentiment, key=scores.get)
    return sentiment[strongest]
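
# Illustrative calls (made-up inputs): the returned label is whichever of the
# neg/neu/pos proportions is largest for the text.
print(perform_sentiment_analysis("I absolutely love this!"))   # likely 'Positive'
print(perform_sentiment_analysis("The meeting is at 3 pm."))   # likely 'Neutral'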

def sentiment_analysis_per_sentence(self):
    """This function takes the json_object from the /sentiment-analysis-long
    url. It performs a more detailed analysis on larger text structures.

    The dict object has the following parameters:
        title: title of whatever they want analyzed, so that the data has a heading
        input_text: the actual text being analyzed

    Good parsing tip from:
    https://stackoverflow.com/questions/17618149/divide-string-by-line-break-or-period-with-python-regular-expressions
    """
    # Check the object itself before indexing into it.
    if self.json_object is None or self.json_object['input_text'] is None:
        return self.json_object

    input_text = self.json_object['input_text']
    sentences = [x for x in map(str.strip, input_text.split('.')) if x]
    sia = SentimentIntensityAnalyzer()

    # Collect the sentiment scores for every sentence.
    sentiment_scores = []
    for sentence in sentences:
        sentiment_scores.append(sia.polarity_scores(sentence))

    # Hand the scores off to the statistics helper.
    output_objects = self.sentiment_analysis_sentence_stats(sentiment_scores)
    return output_objects

def SentimentAnalysis(_arg1, library='nltk'):
    '''
    Sentiment Analysis is a procedure that assigns a score from -1 to 1 for a
    piece of text, with -1 being negative and 1 being positive. For more
    information on the function and how to use it, please refer to
    tabpy-tools.md.
    '''
    if not isinstance(_arg1[0], str):
        raise TypeError("SentimentAnalysis expects a list of strings")

    library = library.lower()
    supportedLibraries = {'nltk', 'textblob'}
    if library not in supportedLibraries:
        raise ValueError("Unsupported library: " + library)

    scores = []
    if library == 'nltk':
        sid = SentimentIntensityAnalyzer()
        for text in _arg1:
            sentimentResults = sid.polarity_scores(text)
            score = sentimentResults['compound']
            scores.append(score)
    elif library == 'textblob':
        for text in _arg1:
            currScore = TextBlob(text)
            scores.append(currScore.sentiment.polarity)
    return scores
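
# A rough usage sketch for SentimentAnalysis() (the input list is made up for
# illustration; both code paths return one score per input string).
texts = ["I love this dashboard", "This report is useless"]
print(SentimentAnalysis(texts))               # NLTK VADER compound scores
print(SentimentAnalysis(texts, 'TextBlob'))   # TextBlob polarity scores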

def get_sentiment_score(df, sentiment_command):
    if sentiment_command == "Yes":
        def join_sentiment_text(row):
            return " ".join(row)

        df['sentiment_text'] = df['cleaned_text'].apply(join_sentiment_text)

        # Instantiating the sentiment intensity analyzer.
        sid = SentimentIntensityAnalyzer()

        # Finding the sentiment of each tweet.
        df['sentiment_score'] = df['sentiment_text'].apply(
            lambda review: sid.polarity_scores(review))

        # Getting the sentiment label from the score dictionary.
        def get_sentiment(score_dict):
            if score_dict['compound'] > 0.2:
                return 'Positive'
            elif score_dict['compound'] < -0.2:
                return 'Negative'
            else:
                return 'Neutral'

        # Storing the sentiment in a separate column.
        df['sentiment'] = df['sentiment_score'].apply(get_sentiment)
        df.drop(['sentiment_text', 'sentiment_score'], axis=1, inplace=True)
    return df

def __init__(self):
    # Each document is represented by a tuple (sentence, label).
    n_instances = 100
    subj_docs = [(sent, 'subj') for sent in
                 subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in
                subjectivity.sents(categories='obj')[:n_instances]]

    # Split the subj and obj instances to keep a balanced, uniform class
    # distribution in both the train and test sets.
    train_subj_docs = subj_docs[:80]
    test_subj_docs = subj_docs[80:100]
    train_obj_docs = obj_docs[:80]
    test_obj_docs = obj_docs[80:100]
    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    # Train the classifier.
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])

    # Use simple unigram word features, handling negation.
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply the features to obtain feature-value representations of the datasets.
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    self.trainer = NaiveBayesClassifier.train
    self.classifier = sentim_analyzer.train(self.trainer, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

    self.sid = SentimentIntensityAnalyzer()

def intensivityAnalysis(self, request, context):
    # In our case, request is an InputMessage() object (from the .proto file).
    self.value = request.value

    analyzer = SentimentIntensityAnalyzer()

    # Decode the base64 payload to a string.
    text = base64.b64decode(self.value)
    temp = text.decode('utf-8')

    # Split into an array of lines.
    tempArray = temp.split("\n")

    # Accumulated result for all sentences.
    stringResult = ''
    for line in tempArray:
        if line is not None and len(line) > 1:
            stringResult += line
            stringResult += '\n'
            stringResult += str(analyzer.polarity_scores(line))
            stringResult += '\n\n'

    # Encode the result.
    resultBase64 = base64.b64encode(str(stringResult).encode('utf-8'))

    # To respond we need to create an OutputMessage() object (from the .proto file).
    self.result = OutputMessage()
    self.result.value = resultBase64
    return self.result

def sentiment_analysis(blurb, index):
    # A dictionary temporarily stores the running averages for one review.
    data = {'compound': 0, 'neu': 0, 'neg': 0, 'pos': 0}

    # Split the review into sentences.
    sentence_list = nltk.tokenize.sent_tokenize(blurb)

    # The Vader analyzer from the NLTK library scores each sentence on four
    # parameters: compound, neutral, negative, and positive.
    vader_analyzer = SentimentIntensityAnalyzer()
    for text in sentence_list:
        temp = vader_analyzer.polarity_scores(text)
        for key in ('compound', 'neu', 'neg', 'pos'):
            # Average each parameter over all sentences to obtain the
            # Vader scores for the whole review.
            if len(sentence_list) != 0:
                data[key] += temp[key] / len(sentence_list)
    return (index, data)

def analyze_sentiment_vader_lexicon(review, threshold=0.1, verbose=False):
    # Pre-process the text.
    review = tn.strip_html_tags(review)
    review = tn.remove_accented_chars(review)
    review = tn.expand_contractions(review)

    # Analyze the sentiment of the review.
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)

    # Get the aggregate score and the final sentiment.
    agg_score = scores["compound"]
    final_sentiment = "positive" if agg_score >= threshold else "negative"

    if verbose:
        # Display detailed sentiment statistics.
        positive = str(round(scores['pos'], 2) * 100) + "%"
        final = round(agg_score, 2)
        negative = str(round(scores['neg'], 2) * 100) + "%"
        neutral = str(round(scores['neu'], 2) * 100) + "%"
        sentiment_frame = pd.DataFrame(
            [[final_sentiment, final, positive, negative, neutral]],
            columns=pd.MultiIndex(
                levels=[["SENTIMENT STATS:"],
                        ["Predicted Sentiment", "Polarity Score", "Positive",
                         "Negative", "Neutral"]],
                # `labels` was renamed to `codes` in newer pandas releases.
                codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]]))
        print(sentiment_frame)

    return final_sentiment

def get_sentiment_features(input_file, output):
    # Positive words
    output['positive_num'] = (input_file['posemo'] / 100) * input_file['WC']
    output['positive_prop'] = input_file['posemo']

    # Negative words
    output['negative_num'] = (input_file['negemo'] / 100) * input_file['WC']
    output['negative_prop'] = input_file['negemo']

    # Anxiety words
    output['anxiety_num'] = (input_file['anx'] / 100) * input_file['WC']
    output['anxiety_prop'] = input_file['anx']

    # Anger words
    output['anger_num'] = (input_file['anger'] / 100) * input_file['WC']
    output['anger_prop'] = input_file['anger']

    # Sadness words
    output['sadness_num'] = (input_file['sad'] / 100) * input_file['WC']
    output['sadness_prop'] = input_file['sad']

    # Overall emotional words
    output['overall_emotional_num'] = (input_file['affect'] / 100) * input_file['WC']
    output['overall_emotional_prop'] = input_file['affect']

    senti_analyser = SentimentIntensityAnalyzer()
    # Pass unprocessed text to the sentiment analyser, but remove newlines
    # and dashes first.
    output['average_sentiment_of_word'] = input_file['unprocessed_text'].apply(
        lambda x: senti_analyser.polarity_scores(" ".join(
            re.sub(r'[^\w\s!?.]', "", x).splitlines()))['compound'])
    return output

def analysis():
    sid = SentimentIntensityAnalyzer()
    comments = ["islam is the worst religion"]
    for comment in comments:
        sentiment = sid.polarity_scores(comment)
        print(sentiment)
        print(word_tokenize(comment))

def extract_features(text):
    wordcount_pos = 0
    wordcount_neg = 0
    bigram_count_pos = 0
    bigram_count_neg = 0
    compound_scores = list()
    positive_scores = list()
    sia = SentimentIntensityAnalyzer()
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount_pos += 1
            if word.lower() in top_100_negative:
                wordcount_neg += 1
            if word in positive_bigram_finder.word_fd:
                bigram_count_pos += 1
            if word in negative_bigram_finder.word_fd:
                bigram_count_neg += 1
        # Score each sentence once and reuse the result.
        scores = sia.polarity_scores(sentence)
        compound_scores.append(scores["compound"])
        positive_scores.append(scores["pos"])

    # Adding 1 to the final compound score to always have positive numbers,
    # since some classifiers used later don't work with negative numbers.
    curr_features = [
        mean(compound_scores) + 1,
        mean(positive_scores),
        wordcount_pos,
        wordcount_neg,
        bigram_count_pos,
        bigram_count_neg,
    ]
    return curr_features

def sentiment_analyzer(input_data: dict) -> None:
    """Run VADER sentiment analysis over every task line in the package."""
    nltk.download('vader_lexicon')
    info("Processing tasks with sentimentanalyzer plugin now!")
    sid = SentimentIntensityAnalyzer()
    package_export_content_modules = get_value(
        CONTENT_MOD_STRING, input_data)[CONTENT_MOD_STRING]
    for values in package_export_content_modules:
        raw_task_data = get_task_data_listed(
            package_export_content_modules, values)
        for package in raw_task_data:
            for titles, task_item in raw_task_data[package].items():
                line_count = 0
                line_item = task_item.split("\n")
                for task_line_item in line_item:
                    line_count += 1
                    # Skip lines that are inline base64-encoded images.
                    test_search = r"data:image\/\S{1,4};base64"
                    x = re.findall(test_search, task_line_item)
                    if len(x) < 1 < len(task_line_item):
                        print(
                            f"Package: {package}\nTask Title: {titles}\n"
                            f"Line Count: {line_count}\nSentence Analyzed:"
                            f" {task_line_item}")
                        kvp = sid.polarity_scores(task_line_item)
                        for k in kvp:
                            print(f"{k}: {kvp[k]}")
                        print()

def get_comments_and_parents(post):
    post.comments.replace_more(limit=None)
    comments = post.comments.list()
    vader_analyzer = SentimentIntensityAnalyzer()
    parents = []
    parents_scores = []
    rtn_comments = []
    scores = []
    for comment in comments:
        comment_parent = comment.parent()
        comment_scores = vader_analyzer.polarity_scores(comment.body)
        comment_scores_lst = [
            comment_scores["neg"],
            comment_scores["neu"],
            comment_scores["pos"],
            comment_scores["compound"],
        ]
        scores += [comment_scores_lst]
        try:
            parents += [comment_parent.body]
            parent_scores = vader_analyzer.polarity_scores(comment_parent.body)
            parent_scores_lst = [
                parent_scores["neg"],
                parent_scores["neu"],
                parent_scores["pos"],
                parent_scores["compound"],
            ]
            parents_scores += [parent_scores_lst]
            rtn_comments += [comment.body]
        except AttributeError:
            # Top-level comments have the submission as their parent,
            # which has no .body attribute.
            pass
    return ([comment.body for comment in comments], scores,
            [comment.score for comment in comments]), \
          (parents, parents_scores, rtn_comments)

def post_request_user():
    json_object = request.get_json()
    sa = Sentiment(json_object)
    string_plot = sa.get_string_from_object()['input_text']
    filtered_txt = sa.remove_stopwords(string_plot)
    sia = SentimentIntensityAnalyzer()
    dict_out = sia.polarity_scores(filtered_txt)
    return dict_out

def init_model():
    # Train a CatBoost model to predict the number of streams.
    global model
    print('********* Training model... *********')
    official_competition_dataset = pd.read_csv(
        'https://datahack2020dataset.s3.us-east-2.amazonaws.com/OfficialCompetitionDataset.csv')

    # No hotness or critic rating, because those are unlikely for an amateur band.
    numerical_cols = ['auditory', 'beats_per_measure', 'beats_per_min',
                      'concert_probability', 'danceability', 'hype',
                      'instrumentalness', 'length_minutes', 'lyricism',
                      'nplays', 'positivity', 'volume']
    # No critic/reviewer_type/album/artist/name.
    categorical_features = ['major/minor', 'styles', 'tone', 'vulgar']
    numerical_cols_no_nplays = [x for x in numerical_cols if x != 'nplays']

    y = official_competition_dataset['nplays']
    y = np.log1p(y)
    X = official_competition_dataset.drop('nplays', axis=1)
    X["reviewer_type"].fillna("contributor", inplace=True)
    X["styles"].fillna("rock", inplace=True)

    # Derive numeric features (length, upper-case word count, VADER
    # positive/negative scores) from the free-text columns.
    text_features = ['name', 'album', 'artist']
    extra_numerical_features = []
    sia = SentimentIntensityAnalyzer()
    for f in text_features:
        temp = [f'{f}_len', f'{f}_upper', f'{f}_sent_pos', f'{f}_sent_neg']
        X[f'{f}_len'] = X[f'{f}'].str.len()
        X[f'{f}_upper'] = X[f'{f}'].apply(
            lambda s: len([w for w in s.split() if w.isupper()]))
        X[f'{f}_sent_pos'] = X[f'{f}'].apply(
            lambda s: sia.polarity_scores(s)['pos'])
        X[f'{f}_sent_neg'] = X[f'{f}'].apply(
            lambda s: sia.polarity_scores(s)['neg'])
        extra_numerical_features = extra_numerical_features + temp
    X.drop(text_features, axis=1, inplace=True)
    X = X[numerical_cols_no_nplays + extra_numerical_features + categorical_features]
    print(X.head())

    # Drop one of every pair of highly correlated features.
    cor_matrix = X.corr().abs()
    upper_tri = cor_matrix.where(
        np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))  # np.bool is deprecated
    to_drop = [column for column in upper_tri.columns
               if any(upper_tri[column] > 0.5)]
    print(to_drop)
    numerical_cols_no_nplays_no_highly_correlated_features = [
        x for x in numerical_cols_no_nplays if x not in to_drop]
    X = X.drop(to_drop, axis=1)
    print(X.head())
    print(X.columns)
    print(list(X.iloc[0]))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    global scaler
    scaler = MinMaxScaler()
    X_train[numerical_cols_no_nplays_no_highly_correlated_features] = \
        scaler.fit_transform(
            X_train[numerical_cols_no_nplays_no_highly_correlated_features])
    X_test[numerical_cols_no_nplays_no_highly_correlated_features] = \
        scaler.transform(
            X_test[numerical_cols_no_nplays_no_highly_correlated_features])
    print("**********************")
    print(numerical_cols_no_nplays_no_highly_correlated_features)

    model = catboost.CatBoostRegressor(
        depth=10, l2_leaf_reg=5, learning_rate=0.1,
        cat_features=categorical_features, logging_level="Silent")
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    n = X_test.shape[0]
    p = X_test.shape[1]
    r2 = r2_score(y_test, pred)
    print('r2 score: ', r2)
    print('adjusted r2 score: ', 1 - (1 - r2) * (n - 1) / (n - p - 1))
    print('root mean squared error: ', math.sqrt(mean_squared_error(y_test, pred)))
    print('mean absolute error: ', mean_absolute_error(y_test, pred))
    model.save_model('catboost_regressor')

def get_features(paragraph):
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(paragraph)
    # Return the four VADER scores in a fixed order.
    return [sentiment_score['neg'], sentiment_score['neu'],
            sentiment_score['pos'], sentiment_score['compound']]

def get_sentiments(text):
    l_pos = "pos"
    l_neg = "neg"
    l_neut = "neu"
    vader_analyzer = SentimentIntensityAnalyzer()
    sentiments = vader_analyzer.polarity_scores(text)
    return {key: sentiments[key] for key in [l_pos, l_neg, l_neut]}
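
# Example (made-up input): only the pos/neg/neu proportions are returned;
# the compound score is dropped.
print(get_sentiments("NLTK makes sentiment analysis easy."))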

def polarity(text):
    """
    Output polarity scores for a text using the Vader approach.

    :param text: a text whose polarity has to be evaluated.
    """
    vader_analyzer = SentimentIntensityAnalyzer()
    return vader_analyzer.polarity_scores(text)

def sentiment(text):
    vader_analyzer = SentimentIntensityAnalyzer()
    output = vader_analyzer.polarity_scores(text)
    if output['neg'] > 0.3:
        return 0, output['neg']
    elif output['pos'] > 0.3:
        return 1, output['pos']
    return 2, output['neu']
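
# Illustrative call (made-up input): sentiment() returns a (label, score)
# tuple where 0 = negative, 1 = positive, and 2 = neutral.
label, score = sentiment("This movie was terrible and boring.")
print(label, score)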

def isPositiveMovieReview(review_id: str) -> bool:
    """True if the average of all sentence compound scores is positive."""
    text = nltk.corpus.movie_reviews.raw(review_id)
    sia = SentimentIntensityAnalyzer()
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    return mean(scores) > 0
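
# A small driver sketch for isPositiveMovieReview(), assuming the NLTK
# movie_reviews, punkt, and vader_lexicon resources have been downloaded.
for review_id in nltk.corpus.movie_reviews.fileids()[:3]:
    print(review_id, isPositiveMovieReview(review_id))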

def save_articles_with_sentiment():
    articles = get_combined_articles()
    sid = SentimentIntensityAnalyzer()
    for i, article in enumerate(articles):
        print(i)
        ss = sid.polarity_scores(article['text'])
        article["polarity"] = ss
    with open('out/articles_train_data_with_sentiment.json', 'w') as fout:
        json.dump(articles, fout)

def analyze_email_sentiment(email: str) -> bool:
    '''
    Analyze the email sentiment: return True if the email has a positive
    compound sentiment, False otherwise.
    '''
    sia = SentimentIntensityAnalyzer()
    return sia.polarity_scores(email)["compound"] > 0

def extract_significant_words(review):
    # Assumes `review` is an iterable of word tokens and that the
    # significantWords / significantWordPolarities lists are module globals.
    mySentAnalyzer = SentimentIntensityAnalyzer()
    for word in review:
        score = mySentAnalyzer.polarity_scores(word)["compound"]
        if score != 0:
            if word not in significantWords:
                significantWords.append(word)
                significantWordPolarities.append(score)

def intensity_analyser_score(self, tweet):
    analyser = SentimentIntensityAnalyzer()
    analysis = analyser.polarity_scores(tweet)
    # VADER's conventional compound thresholds for a 3-way label.
    if analysis['compound'] >= 0.05:
        return 'positive'
    elif analysis['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

def demo_vader_instance(text):
    """
    Output polarity scores for a text using the Vader approach.

    :param text: a text whose polarity has to be evaluated.
    """
    from nltk.sentiment import SentimentIntensityAnalyzer
    vader_analyzer = SentimentIntensityAnalyzer()
    print(vader_analyzer.polarity_scores(text))

def main():
    t_start = timer()
    training_set, content_data = make_training_set()
    print_dt('training_set', t_start)

    # Split the hashtag content.
    t_start = timer()
    for twitter in content_data:
        if 'hashtag_content' in twitter:
            twitter['hashtag_content'] = find_match_word(
                twitter['hashtag_content'].lower(), training_set)
    print_dt('split_hash_tag', t_start)

    t_start = timer()
    sid = SentimentIntensityAnalyzer()
    positive_tweets = []
    negative_tweets = []
    neutral_tweets = []
    compound_tweets = []
    for twitter in content_data:
        if 'hashtag_content' in twitter:
            temp_content = process_content(
                twitter['twitter_content'] + ' '.join(twitter['hashtag_content']))
        else:
            temp_content = process_content(twitter['twitter_content'])
        temp_content = ' '.join(temp_content)
        result = sort_ordered_dict(sid.polarity_scores(temp_content))
        # dict views are not indexable in Python 3; take the first key explicitly.
        top_key = next(iter(result))
        if top_key == 'pos':
            positive_tweets.append(temp_content)
        elif top_key == 'neg':
            negative_tweets.append(temp_content)
        elif top_key == 'neu':
            neutral_tweets.append(temp_content)
        elif top_key == 'compound':
            compound_tweets.append(temp_content)
    print_dt('sentiment_analysis', t_start)

    print('positive_tweets: ', len(positive_tweets))
    print('negative_tweets: ', len(negative_tweets))
    print('neutral_tweets: ', len(neutral_tweets))
    print('compound_tweets: ', len(compound_tweets))

def sentiment_filter(self, text_type):
    # Map the requested sentiment to VADER's score key; the threshold
    # is the same (.3) for every sentiment.
    sentiment_keys = {'positive': 'pos', 'negative': 'neg', 'neutral': 'neu'}
    sentiment_factor = .3
    sentiment = sentiment_keys[self.sentiment]

    if text_type == 'Speech':
        text_type = self.corpus_speech
    elif text_type == 'Tweet':
        text_type = self.corpus_tweet

    sentences = sent_tokenize(text_type)
    sid = SentimentIntensityAnalyzer()
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        if ss[sentiment] > sentiment_factor:
            self.tokens += word_tokenize(sentence)

class Vader_Sentiment:
    sentiments = ('pos', 'neg', 'neu', 'compound')
    name = 'Vader'

    @wait_nltk_data
    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()

    def transform(self, corpus, copy=True):
        scores = []
        for text in corpus.documents:
            pol_sc = self.vader.polarity_scores(text)
            scores.append([pol_sc[x] for x in self.sentiments])
        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # Set compute values.
        shared_cv = SharedTransform(self)
        cv = [VectorizationComputeValue(shared_cv, col)
              for col in self.sentiments]

        if copy:
            corpus = corpus.copy()
        corpus.extend_attributes(X, self.sentiments, compute_values=cv)
        return corpus

import csv
import re
import nltk
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment import SentimentIntensityAnalyzer

# The output file is in the following format:
'''tweet label positive negative neutral totalpos totalneg totalneu negation
hashtag ? * ! capitalized capsPos capsNeg capsNeu'''

with open('../data/temppreprocessedTraining.data', 'rt') as f:
    reader = csv.reader(f, delimiter='\t')
    l = list(reader)

sid = SentimentIntensityAnalyzer()
f = open("../data/featuresTraining.data", 'w+')
for row in l:
    sentiment = row[2]
    tweet = row[3]
    tweet = tweet[:-2]
    ss = sid.polarity_scores(tweet)
    f.write(tweet + " " + sentiment + " ")
    # Positive, negative, and neutral polarity flags. NOTE: the original
    # snippet is truncated after the first check; the remaining flags below
    # are an assumed completion following the column format described above.
    f.write("1 " if ss['pos'] > 0.0 else "0 ")
    f.write("1 " if ss['neg'] > 0.0 else "0 ")
    f.write("1 " if ss['neu'] > 0.0 else "0 ")
    f.write("\n")
f.close()

def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using the Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.metrics import (accuracy as eval_accuracy,
                              precision as eval_precision,
                              recall as eval_recall,
                              f_measure as eval_f_measure)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields,
                        strip_off_emoticons=False, limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        if score > 0:
            observed = 'pos'
        else:
            observed = 'neg'
        acc_test_results.append(observed)
        test_results[observed].add(i)

    metrics_results = {}
    for label in labels:
        accuracy_score = eval_accuracy(acc_gold_results, acc_test_results)
        metrics_results['Accuracy'] = accuracy_score
        precision_score = eval_precision(gold_results[label], test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
                        Instances=n_instances, Results=metrics_results)

def sentiment(text):
    vader_analyzer = SentimentIntensityAnalyzer()
    output = vader_analyzer.polarity_scores(text)
    return output

def __init__(self):
    self.vader = SentimentIntensityAnalyzer()

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

from github import Github
from nltk.sentiment import SentimentIntensityAnalyzer

g = Github("github_username", "github_password")

if len(sys.argv) > 1:
    username = sys.argv[1]
    repo = sys.argv[2]
else:
    username = input("username: ")
    repo = input("repo: ")

sid = SentimentIntensityAnalyzer()
for commit in g.get_user(username).get_repo(repo).get_commits():
    print(commit.commit.message)
    ss = sid.polarity_scores(commit.commit.message)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()