def main():
    """Score every tweet in ``file_input`` with VADER and append it to ``file_output``.

    Relies on the module-level names ``file_input``, ``file_output`` and
    ``new_column_names``, and on an ``identifiers.csv`` file in the working
    directory whose first column holds the show titles.

    Titles that are not sentiment-neutral are replaced by the token
    ``PLACEHOLDER`` in the tweet text before scoring, so the title itself
    does not bias the score. The untouched tweet text is what gets written.
    """
    start = time.time()

    # If output file does not exist, create it and provide it with the column names
    if not os.path.exists(file_output):
        with open(file_output, mode="w", newline='') as output:
            csv.writer(output).writerow(new_column_names)

    # One analyzer serves the whole run; building it is loop-invariant work.
    sia = SIA()

    # Analyze the title of each show and note whether it is sentiment neutral
    neutral_title_mapping = {}
    with open('identifiers.csv', mode='r') as show_file:
        for row in csv.reader(show_file):
            if not row:
                # Blank lines (e.g. a trailing newline) carry no title.
                continue
            show = row[0]
            pol_score = sia.polarity_scores(show)
            neutral_title_mapping[show] = float(pol_score['neu']) == 1.0

    # Read the input file in 10000 record sized chunks, and for each tweet
    # calculate the sentiment, and append the results to the output file
    with open(file_input, encoding="utf8") as csv_file, \
            open(file_output, 'a', newline='') as output:
        writer = csv.writer(output)
        for data_chunk in pd.read_csv(csv_file, chunksize=10000):
            for record in data_chunk.itertuples(index=True, name='Pandas'):
                show = record.category
                text = record.text

                # Substitute placeholder for the title
                if not neutral_title_mapping[show]:
                    # Escape the title: it is literal text, not a pattern, and
                    # may contain regex metacharacters such as '.', '?' or '('.
                    text = re.sub(re.escape(show.lower()), 'PLACEHOLDER', text)
                    print(show + ' ' + text)

                results = sia.polarity_scores(text)
                writer.writerow([
                    show,
                    record.status_id,
                    record.in_reply_to_id,
                    record.user_id,
                    record.text,
                    record.language,
                    record.created_at,
                    record.location,
                    record.verified,
                    "Strong",
                    results['neg'],
                    results['neu'],
                    results['pos'],
                    results['compound'],
                ])

    end = time.time()
    print("Finished in %d seconds" % (end - start))
def vader(sentence):
    """Return the VADER scores of *sentence* as ``[neg, neu, pos, compound]``.

    VADER (Valence Aware Dictionary and sEntiment Reasoner) -
    https://github.com/cjhutto/vaderSentiment
    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model
    for Sentiment Analysis of Social Media Text. Eighth International
    Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
    """
    scores = SIA().polarity_scores(sentence)
    return [scores[key] for key in ('neg', 'neu', 'pos', 'compound')]
def sentiment_analyser(self, search_query):
    """
    Runs a Google News Search on the input string and then uses VADER
    sentiment analysis engine on each returned headline.

    Input: Search Query String
    Output: DataFrame with the columns Client, Date, Headline, Source,
            VADER Score (the compound score), neg, neu and pos - one row
            per scraped article.
    """
    # Create a Covid-19 News DataFrame for the organization of interest.
    # A fresh 0..n-1 index keeps the result index the same as the merge-based
    # version produced.
    news_df = self.covid19_news_scraper(search_query).reset_index(drop=True)

    # Initialize VADER Sentiment Intensity Analyzer
    sia = SIA()

    # Calculate the polarity score for each headline, row-aligned with news_df
    scores = [sia.polarity_scores(headline) for headline in news_df['Headline']]
    sent_df = pd.DataFrame(
        scores,
        columns=['neg', 'neu', 'pos', 'compound'],
        index=news_df.index,
    )

    # Attach the scores positionally. Joining on the 'Headline' text would
    # multiply rows whenever two articles share a headline, and would fail
    # outright when the scraper returns no rows.
    merge_df = pd.concat([news_df, sent_df], axis=1)

    # Re-order and Rename the columns
    merge_df = merge_df.rename(columns={'compound': 'VADER Score'})
    col_order = ['Client', 'Date', 'Headline', 'Source', 'VADER Score', 'neg', 'neu', 'pos']

    print('Completed processing %s' % search_query, "...")
    return merge_df[col_order]
def get_sentimentSubreddit(subreddit):
    """Score the distinct hot-post titles of *subreddit* with VADER.

    Returns a DataFrame with the VADER scores, the ``headline`` text and a
    ``label`` column: 1 when compound > 0.2, -1 when compound < -0.2, else 0.
    Uses the module-level ``reddit`` client.
    """
    board = reddit.subreddit('{}'.format(subreddit))
    # A set drops repeated titles.
    titles = {post.title for post in board.hot(limit=None)}

    analyzer = SIA()
    records = [
        {**analyzer.polarity_scores(title), 'headline': title}
        for title in titles
    ]

    df = pd.DataFrame.from_records(records)
    df['label'] = 0
    is_positive = df['compound'] > 0.2
    is_negative = df['compound'] < -0.2
    df.loc[is_positive, 'label'] = 1
    df.loc[is_negative, 'label'] = -1
    return df
def Attribute(textList):
    """Count attribute mentions in *textList* and aggregate their sentiment.

    ``./attribute.json`` maps each attribute name to a collection of keywords.
    A text counts once for an attribute as soon as any of its keywords occurs
    in the text.

    Returns ``(attDict, resultDict)``: ``attDict`` maps attribute -> number of
    matching texts; ``resultDict`` maps attribute -> polarity-score dict plus a
    ``'Total'`` count.
    """
    attJson = readJson('./attribute.json')
    attDict = {}
    resultDict = {}
    sia = SIA()

    for text in textList:
        for attribute in attJson:
            # First matching keyword is enough; later keywords are not checked.
            if not any(keyword in text for keyword in attJson[attribute]):
                continue

            scores = sia.polarity_scores(text)
            if attribute in resultDict:
                running = resultDict[attribute]
                for key in scores.keys():
                    running[key] = running[key] + scores[key]
                running['Total'] = running['Total'] + 1
            else:
                scores['Total'] = 1
                resultDict[attribute] = scores

            attDict[attribute] = attDict.get(attribute, 0) + 1

    # Turn the running sums into means for the first three keys only.
    # NOTE(review): with NLTK's key order (neg, neu, pos, compound) this
    # leaves 'compound' as a sum rather than a mean - confirm that is intended.
    for attribute in list(resultDict.keys()):
        totals = resultDict[attribute]
        for key in list(totals.keys())[0:3]:
            totals[key] = totals[key] / totals['Total']

    return attDict, resultDict
def output_csv_files(output_dfs, output_df_strs):
    """Write one ``misclassified_<stem>.csv`` per input DataFrame.

    For every frame, column 1 is stringified and run through
    ``preprocess_string``; the resulting tokens are stored in a new column 2
    (the input frame is modified in place). A row is reported as misclassified
    when its tokens contain 'unprofession' but the VADER compound score of
    column 0 is positive, or contain 'profession' but the score is negative.

    Args:
        output_dfs: sequence of DataFrames with columns 0 and 1.
        output_df_strs: file-name stems, one per DataFrame.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert (len(output_dfs) == len(output_df_strs))
    sia = SIA()

    for output_df, stem_name in zip(output_dfs, output_df_strs):
        output_df[2] = output_df[1].astype(str).apply(
            lambda rev: preprocess_string(str(rev)))

        # Collect matching rows in a list and build the frame once:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        misclassified_rows = []
        for _, row in output_df.iterrows():
            tokens = list(row[2])
            says_unprofessional = 'unprofession' in tokens
            says_professional = 'profession' in tokens
            if not (says_unprofessional or says_professional):
                continue

            # Score once per row instead of once per condition.
            compound = sia.polarity_scores(row[0])['compound']
            if (says_unprofessional and compound > 0.0) or \
                    (says_professional and compound < 0.0):
                misclassified_rows.append(row)

        # reset_index mirrors the old append(..., ignore_index=True) numbering.
        misclassified_df = pd.DataFrame(misclassified_rows).reset_index(drop=True)

        f_name = 'misclassified_' + stem_name + '.csv'
        print(type(misclassified_df))
        print(f_name)
        misclassified_df.to_csv(f_name)
    return
def reddit_analysis():
    """Return the percentage of new r/Amazon headlines labelled negative.

    Each distinct headline is scored with VADER; a headline is labelled
    -1 when its compound score is below -0.2, 1 when above 0.2, else 0.
    Returns 0.0 when no headline is labelled negative or none were fetched.
    """
    # nltk.download('vader_lexicon')
    # SECURITY(review): API credentials are hard-coded in source. They should
    # be rotated and loaded from configuration or the environment instead.
    reddit = praw.Reddit(client_id='7jUlWSWelE0zpg',
                         client_secret='V8W2JZJs05cYOFCRb6oMTN7TXvY',
                         user_agent='cyberdhirendra')

    headlines = set()
    for submission in reddit.subreddit('Amazon').new(limit=None):
        headlines.add(submission.title)
        display.clear_output()

    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
    sia = SIA()

    results = []
    for line in headlines:
        pol_score = sia.polarity_scores(line)
        pol_score['headline'] = line
        results.append(pol_score)

    if not results:
        # Without rows the frame has no 'compound' column to label.
        return 0.0

    dtf = pd.DataFrame.from_records(results)
    dtf['label'] = 0
    dtf.loc[dtf['compound'] > 0.2, 'label'] = 1
    dtf.loc[dtf['compound'] < -0.2, 'label'] = -1

    counts = dtf.label.value_counts(normalize=True) * 100
    # value_counts only lists labels that occur, so -1 may be absent.
    return counts.get(-1, 0.0)
def derive_columns(data_frame):
    """Add the derived feedback columns and return the enriched frame.

    Adds 'Feedback' (positive + negative text), 'Sites', 'Sites2', 'Brands',
    'Issues', 'Components' and 'compound'. The first four are written onto
    *data_frame* in place; the merged result that also carries the last three
    is returned as a new frame.

    Relies on the module-level helpers ``mentioned_site``, ``convert_to_list``,
    ``mentioned_brand`` and ``evalSentences``, and on the ``WTI`` / ``WTC``
    mappings whose values expose ``.search`` (presumably compiled regexes).
    """
    # based on cols from data after cleaning
    data_frame['Feedback'] = data_frame['Positive Feedback'].map(
        str) + data_frame['Negative Feedback'].map(str)

    print('Finding mentioned sites and brands...')
    data_frame['Sites'] = data_frame.apply(mentioned_site, axis=1)
    data_frame['Sites2'] = data_frame.apply(convert_to_list, axis=1)  # TEMPORARY COLUMN
    data_frame['Brands'] = data_frame.apply(mentioned_brand, axis=1)

    # The derived columns below are computed from the frame that was passed
    # in, not from a module-level ``df`` that may be a different object.
    print('Categorizing feedback into issue types...')
    data_frame = data_frame.merge(data_frame['Feedback'].apply(lambda s: pd.Series(
        {'Issues': [k for k, v in WTI.items() if v.search(s)]})),
        left_index=True, right_index=True)

    print('Categorizing feedback into component types...')
    data_frame = data_frame.merge(data_frame['Feedback'].apply(lambda s: pd.Series(
        {'Components': [k for k, v in WTC.items() if v.search(s)]})),
        left_index=True, right_index=True)

    sid = SIA()
    print('Applying sentiment analysis to feedback...')
    data_frame = data_frame.merge(data_frame['Feedback'].apply(
        lambda s: pd.Series({'compound': evalSentences(sid, s)[0]})),
        left_index=True, right_index=True)

    return data_frame
def sent_n(type_t):  # "Negative"
    """Return the interviews of ``type_df[type_t]`` that VADER rates negative.

    Builds a working frame from the 'Interview' and 'Interview Date' columns,
    drops its last five rows, scores each interview and flags it as
    negative (compound < 0.2), positive (compound > 0.5) or neutral
    (anything in between). The 'easy' and 'difficult' columns are created
    but left at 0.

    Returns the negative rows sorted by 'Interview Date', newest first.
    """
    source = type_df[type_t]
    ter = pd.DataFrame()
    ter["int"] = source["Interview"]
    ter["Interview Date"] = source["Interview Date"]
    ter["positive"] = 0
    ter["negative"] = 0
    ter["easy"] = 0
    ter["neutral"] = 0
    ter["difficult"] = 0
    ter["compound"] = 0.0
    # Copy so the column writes below target this frame rather than a slice.
    ter = ter.iloc[:-5, :].copy()

    analyzer = SIA()
    compounds = [
        float(analyzer.polarity_scores(sentence)["compound"])
        for sentence in ter["int"]
    ]

    # Whole-column assignment replaces ter[col][row] = ...: that form is
    # chained assignment and used positional counters as index labels, which
    # only lined up when the index happened to be 0..n-1.
    ter["compound"] = compounds
    ter["negative"] = [1 if score < 0.2 else 0 for score in compounds]
    ter["positive"] = [1 if score > 0.5 else 0 for score in compounds]
    ter["neutral"] = [1 if 0.2 <= score <= 0.5 else 0 for score in compounds]

    bad = ter[ter["negative"] == 1]
    bad = bad.sort_values("Interview Date", ascending=False)
    return bad
def acquire_feed_data(feed_name, url, hash_key_array):
    '''
    Acquire the posts of an RSS feed and store the new ones in the database.

    A post is identified by the MD5 hex digest of its title + publication
    date; posts whose digest is already in hash_key_array are skipped.
    New posts get the VADER positive/negative score of their title and are
    handed to log_news_in_db.

    feed_name      -- label stored with every post of this feed
    url            -- RSS URL to parse
    hash_key_array -- container of digests already stored (not modified here)
    '''
    # get the feed data from the url
    feed = feedparser.parse(url)

    # Built on first use and then shared by every new post of this feed.
    sia = None

    for post in feed.entries:
        # Get post attributes (title, pubdate, description, etc.) from the news feed
        title = post.title
        comment = "COMMENTS ARE SUPRESSED"  # post.comments
        link = post.link
        pubdate = str(parse(post.published).strftime('%Y-%m-%d %H:%M:%S'))
        # NOTE(review): on Python 3 str(bytes) yields the "b'...'" repr, not
        # the text. Kept as-is because stored rows and log_news_in_db may rely
        # on it - confirm before changing.
        description = str(post.description.encode("utf=8"))
        hash_key = hashlib.md5(''.join((title, pubdate)).encode()).hexdigest()

        # if post is already in the database, skip it
        if hash_key in hash_key_array:
            continue

        if sia is None:
            sia = SIA()

        # apply sentiment analysis on the news title and derive positive/negative score
        sentimental_score = sia.polarity_scores(title)
        pos_score = str(sentimental_score["pos"])
        neg_score = str(sentimental_score["neg"])

        # insert row into database
        log_news_in_db(hash_key, feed_name, title.replace("'", ""),
                       link.replace("'", ""), comment.replace("'", ""),
                       description, pubdate, pos_score, neg_score)
def __init__(self):
    """Create the sentiment analyzer and tokenizers used by this instance."""
    analyzer = SIA()
    word_punct = WordPunctTokenizer()
    punkt = nltk.data.LazyLoader('tokenizers/punkt/english.pickle')

    self._sent_analyzer = analyzer
    # Only the bound tokenize callables are kept, not the tokenizer objects.
    self._word_tokenizer = word_punct.tokenize
    self._sent_tokenizer = punkt.tokenize
    self._ids = []
def get_rss(ticker):
    """Fetch the Nasdaq RSS feed for *ticker* and score each article summary.

    Summaries are stripped of HTML and newlines; those of 65 characters or
    fewer are dropped. Returns a DataFrame with the VADER scores plus the
    columns ``text``, ``time`` (a datetime built from the day, month and year
    of the published string) and ``tickers``.
    """
    feed_url = f'https://www.nasdaq.com/feed/rssoutbound?symbol={ticker}'

    kept = []
    for article in feedparser.parse(feed_url).entries:
        body = strip_html(article['summary']).replace('\n', '')
        if len(body) <= 65:
            continue
        kept.append({
            'text': body,
            'time': article['published'],
            'symbols': article['nasdaq_tickers'].split(','),
        })

    analyzer = SIA()
    rows = []
    for item in kept:
        row = analyzer.polarity_scores(item['text'])
        row['text'] = item['text']
        # Fields 1..3 of the published string are day, month name and year.
        day_month_year = ' '.join(item['time'].split(' ')[1:4])
        row['time'] = datetime.strptime(day_month_year, '%d %b %Y')
        row['tickers'] = item['symbols']
        rows.append(row)

    return pd.DataFrame.from_records(rows)
def ColumnScoring(column):
    """Return the VADER ``pos`` score of every entry in *column*, in order.

    Args:
        column: iterable of text entries.

    Returns:
        list with one positive-sentiment score per entry.
    """
    # A single analyzer is enough; building one per entry was pure overhead.
    sid = SIA()
    return [sid.polarity_scores(entry)['pos'] for entry in column]
def get_sentiment_score(self, comment_text):
    """Return the VADER compound score of *comment_text*."""
    polarity = SIA().polarity_scores(comment_text)
    return polarity['compound']
def main():
    """Evaluate VADER on the test headlines and print micro-F1 and accuracy.

    A headline is predicted 1 when its compound score is above 0 and
    -1 otherwise; predictions are compared with the 'SENTIMENT' column.
    """
    test_df = load_test_data()
    headlines = test_df['HEADLINE'].tolist()
    check_test = test_df['SENTIMENT'].tolist()

    # nltk.download('vader_lexicon')
    analyzer = SIA()
    scores = [analyzer.polarity_scores(headline) for headline in headlines]
    nltk_pred = [1 if score['compound'] > 0 else -1 for score in scores]

    print('NLTK F1 Score: [6] ',
          round(f1_score(check_test, nltk_pred, average='micro'), 3))

    total = len(nltk_pred)
    correct = sum(
        1 for predicted, expected in zip(nltk_pred, check_test)
        if predicted == expected
    )
    print('Acc:', correct / total)
def analyze_sentiment(comment_body):
    """
    Calculate the average sentiment of one comment's body text.

    Each string in the comment body is scored with VADER and the compound
    scores are averaged.

    Args:
        comment_body: List of strings representing the text of the comment.

    Returns:
        The mean compound polarity score, or 0 when comment_body is empty.
    """
    analyzer = SIA()
    compounds = [
        analyzer.polarity_scores(sentence)['compound']
        for sentence in comment_body
    ]
    if not compounds:
        return 0
    return sum(compounds) / len(compounds)
def listToSentiment(theList):
    """Return the VADER compound score of every text in *theList*, in order.

    Args:
        theList: iterable of strings. Iterating directly (rather than
            indexing by position) also accepts inputs that do not support
            ``len()`` or 0-based integer indexing.

    Returns:
        list of compound scores, one per input text.
    """
    sia = SIA()
    return [sia.polarity_scores(text)["compound"] for text in theList]
def get_sentiment_of_sentence(sentence):
    """Return the VADER compound score of *sentence*, scored in lower case.

    The input may hold one or more sentences. Only the compound score is
    returned; nothing is stored or cached here.
    """
    analyzer = SIA()
    lowered = sentence.lower()
    scores = analyzer.polarity_scores(lowered)
    return scores["compound"]
def ticker_sentiment(ticker_name):
    """Return the summed VADER scores of Google News titles for a ticker.

    The ticker's ``longName`` is looked up and used as the query of a Google
    News RSS search; every ``<title>`` text in the response is scored.

    Returns:
        ``jsonify`` response with 'Negative Score' and 'Positive Score',
        the sums of the ``neg`` and ``pos`` scores (0.0 when no titles
        were found).
    """
    sia = SIA()

    ticker = yf.Ticker(ticker_name)
    tickername = ticker.info['longName']

    # Pass the query through ``params`` so names containing '&', '#', spaces
    # and similar characters are URL-encoded instead of corrupting the query.
    page = requests.get('https://news.google.com/rss/search',
                        params={'q': tickername})
    tree = html.fromstring(page.content)
    titles = tree.xpath('//title/text()')

    results = []
    for line in titles:
        pol_score = sia.polarity_scores(line)
        pol_score['headline'] = line
        results.append(pol_score)

    # Fixing the columns keeps the frame usable when no titles came back,
    # and only the two numeric columns that are reported get summed.
    sent_df = pd.DataFrame(results, columns=['neg', 'pos'])

    sent_dic = {
        'Negative Score': float(sent_df['neg'].sum()),
        'Positive Score': float(sent_df['pos'].sum()),
    }
    return jsonify(sent_dic)
def ingest_news(insource, inlink, inheader):
    '''
    Ingest news articles into DB

    parameter(s): news article source name (str), news site URL (str),
                  HTML header type for article titles (str)
    returns: void.

    Each scraped title is stored in Generalnews together with its VADER
    compound score. The date column is filled from ``timestamp``, which is
    not defined in this function and must be provided by the module.
    '''
    conn = sqlite3.connect('../data/sentiment.db')
    try:
        cursor = conn.cursor()

        # Setup sentiment capture.
        sia = SIA()

        # Call scraper.
        all_articles = get_news(inlink, inheader)

        # Insert scraped data and associated sentiment.
        for article in all_articles:
            score = sia.polarity_scores(article)
            cursor.execute(
                "INSERT INTO Generalnews(source, title, date, sentiment) VALUES(?,?,?,?)",
                (insource, article, timestamp, float(score.get('compound'))))

        # Commit db changes.
        conn.commit()
    finally:
        # Close the connection even when scraping or an insert fails;
        # anything not yet committed is discarded.
        conn.close()
def polarity(row):
    """Return the VADER compound score of *row*.

    The analyzer is built with the lexicon file found under the current
    working directory's ``nltk_data`` folder.
    """
    lexicon_suffix = '/nltk_data/sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt'
    lexicon_path = os.getcwd() + lexicon_suffix
    analyzer = SIA(lexicon_file=lexicon_path)
    scores = analyzer.polarity_scores(row)
    return scores['compound']
def get_sentiment_analysis(cursor: list):
    """Summarise the sentiment of the tweets in *cursor*.

    Each tweet's ``"semi_processed_text"`` is scored with VADER: compound
    below -0.25 counts as negative, above 0.25 as positive, otherwise neutral.

    Returns:
        ``[]`` when *cursor* is empty; otherwise a tuple of
        ``[("Positive", pct), ("Neutral", pct), ("Negative", pct)]`` and the
        overall label (1, 0 or -1 for the largest group; ties favour
        positive, then neutral).
    """
    if not cursor:
        return []

    analyzer = SIA()
    tally = {"positive": 0, "neutral": 0, "negative": 0}
    for tweet in cursor:
        compound = analyzer.polarity_scores(tweet["semi_processed_text"])["compound"]
        if compound < -0.25:
            tally["negative"] += 1
        elif compound > 0.25:
            tally["positive"] += 1
        else:
            tally["neutral"] += 1

    positive = tally["positive"]
    neutral = tally["neutral"]
    negative = tally["negative"]

    if positive >= max(negative, neutral):
        overall = 1
    elif neutral >= max(positive, negative):
        overall = 0
    else:
        overall = -1

    size = len(cursor)
    shares = [
        ("Positive", positive * 100 / size),
        ("Neutral", neutral * 100 / size),
        ("Negative", negative * 100 / size),
    ]
    return shares, overall
def analyze_sentiment(tweet):
    '''Return the VADER compound score of a tweet using a topic-specific lexicon.

    '#' is removed and '-' becomes a space, the lexicon is extended with the
    weighted words below, and the lower-cased tweet is scored.
    '''
    cleaned = tweet.replace('#', '').replace('-', ' ')

    negative_words = [
        'genocide', 'apartheid', 'warcrimes', 'occupier', 'occupied',
        'occupation', 'boycott', 'freepalestine', 'jewhate', 'suppression',
        'oppression', 'settler', 'settlers', 'settlements', 'settlement',
        'boycottisrael', 'justice', 'infiltrated', 'justice4palestine',
        'stopisraeliapartheid', 'robbers', 'stole', 'stoparmingsrael',
        'preach', 'sanction', 'sanctions'
    ]
    positive_words = [
        'loveisrael', 'supportisrael', 'rockets', 'rocket', 'innovation',
        'startupnation', 'support', 'aid'
    ]
    # These two carry double the negative weight of the rest.
    strongly_negative = ('genocide', 'apartheid')

    overrides = {
        word: (-2 if word in strongly_negative else -1)
        for word in negative_words
    }
    overrides.update({word: 1 for word in positive_words})

    sid = SIA()
    sid.lexicon.update(overrides)

    scores = sid.polarity_scores(cleaned.lower())
    return scores['compound']
def get_sentiment(self, col="processed_text"):
    """Store the VADER compound score of column *col* in ``self.df["score"]``.

    Every value is converted with ``str`` before scoring. Returns ``self`` so
    calls can be chained.
    """
    analyzer = SIA()

    def compound_of(value):
        return analyzer.polarity_scores(str(value))["compound"]

    self.df["score"] = self.df[col].apply(compound_of)
    return self
def get_sentiment(formatted_text):
    """Return the VADER polarity scores of *formatted_text*.

    The analyzer's lexicon is extended with the entries of
    ``../SentimentTrader/Words/crypto_lexicon.json`` before scoring.

    Returns:
        the dict produced by ``polarity_scores``.
    """
    # ``with`` closes the file promptly instead of leaking the handle.
    with open("../SentimentTrader/Words/crypto_lexicon.json") as lexicon_file:
        crypto_lexicon = json.load(lexicon_file)

    sia = SIA()
    sia.lexicon.update(crypto_lexicon)
    return sia.polarity_scores(formatted_text)
def process_tweets_from_file(fin, fout):
    """Add formatted text and VADER sentiment to each tweet of a JSON-lines file.

    Every line of *fin* is parsed as JSON. Tweets whose ``created_at`` does
    not match ``%Y-%m-%dT%H:%M:%S`` (or whose processing raises ValueError)
    are skipped. Valid tweets gain ``formatted_text`` and ``sentiment`` and
    are written to *fout*, one JSON document per line. Output is written
    only after all input has been read.
    """
    valid_count = 0
    invalid_count = 0
    sia = SIA()
    list_of_tweets = []

    with open(fin) as f:
        for line in f:
            j = json.loads(line)
            # Reset per tweet so the error report below never touches an
            # unbound name or shows the previous tweet's timestamp.
            created_at = None
            try:
                # validate date format
                created_at = datetime.strptime(j['created_at'], '%Y-%m-%dT%H:%M:%S')

                # text formatting
                formatted_text = format.format_full(j['text'])
                j['formatted_text'] = formatted_text

                # sentiment
                j['sentiment'] = sia.polarity_scores(formatted_text)

                list_of_tweets.append(json.dumps(j))
                valid_count += 1
                if valid_count % 25000 == 0:
                    print(valid_count)
            except ValueError:
                invalid_count += 1
                # Report only every 100th invalid tweet to limit noise.
                if invalid_count % 100 == 0:
                    print('Invalid:' + str(created_at), j['created_at'])
                continue

    with open(fout, 'w') as f:
        for tweet in list_of_tweets:
            f.write(tweet + '\n')
def process_tweets_from_file(
        fin, fout,
        price_file='/home/amit/Desktop/Sentiment_analysis/Bitcoin-price-prediction-model-master/coindesk-bpi-USD-close_data-2010-07-18_2018-06-07.csv'):
    """Add price, formatted text and VADER sentiment to each tweet of *fin*.

    *price_file* is a two-column CSV of ``%Y-%m-%d %H:%M:%S`` timestamps and
    prices. Each tweet gets the price recorded for the day after it was
    created, or for the same day when the next day lies in the future.
    Tweets whose processing raises ValueError (for example a ``created_at``
    that does not match ``%Y-%m-%dT%H:%M:%S``) are skipped. A date missing
    from the price table raises KeyError.

    Args:
        fin: path of the JSON-lines input file.
        fout: path of the JSON-lines output file.
        price_file: path of the price CSV; defaults to the original location.
    """
    valid_count = 0
    invalid_count = 0
    sia = SIA()
    list_of_tweets = []

    # Price per calendar date; ``with`` closes the CSV once it has been read.
    d = {}
    with open(price_file, 'r') as price_handle:
        for row in csv.reader(price_handle):
            k, v = row
            print(k)
            k = datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S').date()
            d[k] = v
    print(d)

    with open(fin) as f:
        for line in f:
            j = json.loads(line)
            # Reset per tweet so the error report below never touches an
            # unbound name or shows the previous tweet's timestamp.
            created_at = None
            try:
                # validate date format
                created_at = datetime.datetime.strptime(
                    j['created_at'], '%Y-%m-%dT%H:%M:%S')
                this_date = created_at.date()
                print(this_date)
                next_date = this_date + datetime.timedelta(days=1)

                if next_date > datetime.datetime.today().date():
                    j['price'] = d[this_date]
                else:
                    j['price'] = d[next_date]

                # text formatting
                formatted_text = format.format_full(j['text'])
                j['formatted_text'] = formatted_text

                # sentiment
                j['sentiment'] = sia.polarity_scores(formatted_text)

                list_of_tweets.append(json.dumps(j))
                valid_count += 1
                if valid_count % 25000 == 0:
                    print(valid_count)
            except ValueError:
                invalid_count += 1
                # Report only every 100th invalid tweet to limit noise.
                if invalid_count % 100 == 0:
                    print('Invalid:' + str(created_at), j['created_at'])
                continue

    with open(fout, 'w') as f:
        for tweet in list_of_tweets:
            f.write(tweet + '\n')
    print("Successfully processed", len(list_of_tweets), "tweets")
def sentiment_analysis(text, full_score=False):
    """Score *text* with VADER.

    With ``full_score`` true the raw polarity dict is returned. Otherwise the
    result is a logistic squashing of ``5 * neu - |neg - pos|`` into (0, 1).
    """
    score = SIA().polarity_scores(text)
    if full_score:
        return score

    polarity_gap = abs(score['neg'] - score['pos'])
    x = score['neu'] * 5 - polarity_gap
    return 1 / (1 + math.exp(-x))
def nltk_sentiment(text):
    """
    Run sentiment analysis on a string of text and return whatever the
    analyzer's polarity_scores produces for it.
    """
    analyzer = SIA()
    return analyzer.polarity_scores(text)
def __init__(self, filePath):
    """Load the Excel workbook at *filePath* and set up the text helpers.

    Also sets ``strict``, ``convert_charrefs`` and ``fed`` directly on the
    instance.
    """
    # NOTE(review): the original comment says this class inherits from
    # HTMLParser, yet neither super().__init__() nor reset() is called here -
    # confirm the parser state is initialised elsewhere before feed() is used.
    self.filePath = filePath
    workbook = pd.read_excel(filePath)
    stemmer = SnowballStemmer("english")
    analyzer = SIA()

    self.dataFrame = workbook
    self.stem = stemmer
    self.sia = analyzer
    self.strict = False
    self.convert_charrefs = True
    self.fed = []