def get_textblob_sentiments(df, text_column):
    """
    Get sentiments for each row in text_column of dataframe using spaCyTextBlob.
    Append all polarity scores to a list.

    Input:
      - df: dataframe with each doc in a row
      - text_column: column name in df of texts

    Returns:
      - sentiment_scores: list of polarity scores
    """
    # Load spaCy model
    spacy_nlp = spacy.load("en_core_web_sm")

    # Initialise TextBlob and add it to the pipe
    spacy_text_blob = SpacyTextBlob()
    spacy_nlp.add_pipe(spacy_text_blob)

    # Create empty target list to store sentiment scores
    sentiment_scores = []

    # Apply sentiment analysis to each headline in the data
    for doc in spacy_nlp.pipe(tqdm(df[text_column]), batch_size=500):
        # Retrieve polarity score
        sentiment = doc._.sentiment.polarity
        # Append sentiment score to target list
        sentiment_scores.append(sentiment)

    return sentiment_scores
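# A minimal usage sketch for get_textblob_sentiments, assuming the function's own
# dependencies (spacy, SpacyTextBlob, tqdm) are already in scope; the column name
# "text" and the example rows are illustrative, not taken from the function above:
import pandas as pd

example_df = pd.DataFrame({"text": ["What a wonderful day.", "This is terrible news."]})
polarity_scores = get_textblob_sentiments(example_df, "text")
print(polarity_scores)  # one polarity float in [-1.0, 1.0] per row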
def sentimenter(text):
    """
    Given a list of sentences or tweets, return a data frame with two numbers
    for each sentence: the polarity (a score as a float within the range
    [-1.0, 1.0]) and the subjectivity (a float within the range [0.0, 1.0],
    where 0.0 is very objective and 1.0 is very subjective).

    Parameters
    -------------
    text : (list)
        list where each element is a sentence or tweet.

    Returns
    -------------
    (DataFrame)
        two-column data frame with polarity and subjectivity.

    Example
    -------------
    example = ["Contact me at [email protected]",
               "@vcuspinera my webpage is https://vcuspinera.github.io"]
    sentimenter(example)
    (output:) a two-row data frame with 'polarity' and 'subjectivity' columns
    """
    nlp = spacy.load('en_core_web_sm')
    spacy_text_blob = SpacyTextBlob()
    nlp.add_pipe(spacy_text_blob)

    # Run the pipeline once per text, then read both scores off the same doc
    docs = [nlp(tw) for tw in text]
    result = [[doc._.sentiment.polarity, doc._.sentiment.subjectivity]
              for doc in docs]
    result = pd.DataFrame(result).rename(columns={0: 'polarity',
                                                  1: 'subjectivity'})
    return result
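# A minimal usage sketch for sentimenter, assuming the snippet's dependencies
# (spacy, SpacyTextBlob, pandas as pd) are in scope; the input strings are
# illustrative, and the exact scores depend on TextBlob's lexicon:
tweets = ["I love this library!", "This is the worst weather I have ever seen."]
scores = sentimenter(tweets)
print(scores.columns.tolist())  # ['polarity', 'subjectivity']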
def main():
    # tell argparse the arguments you'd like to use - including a subset option for faster processing
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--path", required=True,
                        help="The path to the input csv file")  # the input option
    parser.add_argument("-s", "--subset", required=False,
                        help="The subset option")  # the subset option; not required
    args = vars(parser.parse_args())

    # read the csv file from the path defined above
    data = pd.read_csv(args["path"])

    # check whether a subset has been requested - if so, slice the data
    if args["subset"] is not None:
        slice_index = int(args["subset"])
        data = data[:slice_index]

    # create a directory to store the output (if it doesn't yet exist)
    if not os.path.exists("sentiment_output"):
        os.mkdir("sentiment_output")

    # then call and initialise spaCy, TextBlob, and the nlp pipe
    nlp = spacy.load("en_core_web_sm")  # we're using the small English model
    spacy_text_blob = SpacyTextBlob()   # this is taken from the spaCy website
    nlp.add_pipe(spacy_text_blob)

    ## Now we're set up, we want to program the calculation of scores (we'll use batches of 5000) ##

    # message
    print("\nHold on, we're calculating the sentiment scores...")

    # create an empty list of sentiment scores for every headline; we'll call this sentiment_tracker
    sentiment_tracker = []

    # for every headline in the data frame (we're looking at docs, not sentences)
    for doc in nlp.pipe(data["headline_text"], batch_size=5000):
        # calculate the sentiment (polarity) of the doc (headline)
        sentiment = doc._.sentiment.polarity
        # append it to the sentiment_tracker list
        sentiment_tracker.append(sentiment)

    # append the sentiment_tracker list to the dataframe and save as an output csv file in sentiment_output
    data.insert(len(data.columns), "sentiment", sentiment_tracker)
    output_csv_path = os.path.join("sentiment_output", "sentiment_tracker.csv")
    data.to_csv(output_csv_path, index=False)

    ## Now we can put it together to create rolling mean plots ##

    # message
    print("We've calculated the sentiment scores, now we'll generate the plots...")

    # first, create a sentiment dataframe with the date as the index, so means can be calculated by date
    df_sentiment = pd.DataFrame(
        {"sentiment": sentiment_tracker},  # a column to hold the sentiment scores
        index=pd.to_datetime(data["publish_date"], format='%Y%m%d'))  # parse the dates for the index

    # apply the smoothing plot function from above to create and save plots in the output directory
    smoothed_sentiment_plot("7d", "1-week", df_sentiment)    # 1-week average
    smoothed_sentiment_plot("30d", "1-month", df_sentiment)  # 1-month average

    # print a message to let you know when you're done
    print("That's you complete - woohoo! The csv file and plots are in the sentiment_output directory.\n")
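# A hedged entry-point sketch: main() is wired to argparse above, so the script
# can be run from the shell. The script and csv file names below are assumptions;
# the csv must contain headline_text and publish_date columns:
#
#   python sentiment_analysis.py --path headlines.csv --subset 10000
#
if __name__ == "__main__":
    main()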
def init(df, param):
    # Load English parser and text blob (for sentiment analysis).
    # Note: df and param are currently unused.
    model = spacy.load('en_core_web_sm')
    spacy_text_blob = SpacyTextBlob()
    model.add_pipe(spacy_text_blob)
    return model
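# A minimal sketch of calling init, assuming the spaCy v2-era SpacyTextBlob API
# used throughout this file; since df and param are unused, the None placeholders
# are illustrative:
nlp_model = init(df=None, param=None)
doc = nlp_model("spaCyTextBlob attaches polarity and subjectivity to each doc.")
print(doc._.sentiment.polarity, doc._.sentiment.subjectivity)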
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import lemminflect
from collections import Counter, defaultdict
from spacy_langdetect import LanguageDetector
from spacymoji import Emoji
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
# from afinn import Afinn
import re
import json

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(Emoji(nlp, merge_spans=False), first=True)
nlp.add_pipe(SpacyTextBlob())
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

# Comment out for quick summary
from summarizer import Summarizer
model = Summarizer()

# afinn = Afinn()


def get_summary(raw_text, category_list=[]):
    # file = 'raw_text.txt'
    # with open(file) as f:
    #     raw_text = html2text(f.read())