def getTweets(hashtag):
    try:
        posts = api.user_timeline(screen_name=hashtag, count=10000, lang="en", tweet_mode="extended")
        tweetData = []
        for post in posts:
            result = {}
            result['created_at'], result['Tweet'], result['user'] = str(post.created_at), post.full_text, post.user.name
            result['screen_name'], result['location'], result['about'] = post.user.screen_name, post.user.location, post.user.description
            result['followers'], result['following'] = post.user.followers_count, post.user.friends_count
            result['retweetCount'], result['likeCount'] = post.retweet_count, post.favorite_count
            result['cleanedTweet'] = clean_tweets(clean(post.full_text))
            result['processedTweet'] = clean(post.full_text)
            blob = TextBlob(str(result['cleanedTweet']))
            Sentiment = blob.sentiment
            polarity, subjectivity = Sentiment.polarity, Sentiment.subjectivity
            sentimentClass = 'Negative' if polarity < 0 else 'Positive' if polarity > 0 else 'Neutral'
            result['SentimentScore'], result['Polarity'], result['Subjectivity'] = sentimentClass, polarity, subjectivity
            tweetData.append(result)
        tweet_df = pd.DataFrame(tweetData)
        return tweet_df
    except Exception as e:
        print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), Exception, e)
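# A sketch of the module-level names getTweets() relies on but which are not shown
# above; the import paths are assumptions, and `api` must be an authenticated
# tweepy.API instance created elsewhere. `clean_tweets` is the list-based cleaner
# defined further down in this file.
import sys
import pandas as pd
from textblob import TextBlob
from preprocessor.api import clean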
def clean_tweet(self, text=None):
    # emoji_pattern + re.compile
    if text is None:
        text = self.text_
    tweet = p.clean(text)
    tweet = re.sub(r':', '', tweet)
    return tweet
def clean_tweets(df):
    tempArr = []
    for line in df:
        # send to tweet_processor
        tmpL = clean(str(line))
        # remove punctuation and convert all tweets to lower case
        tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower())
        tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
        tempArr.append(tmpL)
    return tempArr
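# REPLACE_NO_SPACE and REPLACE_WITH_SPACE are used above but never defined in the
# snippet. A plausible definition (an assumption, following the usual
# strip-vs-replace punctuation pattern) might be:
import re

REPLACE_NO_SPACE = re.compile(r"[.;:!\'?,\"()\[\]]")            # punctuation removed outright
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")  # separators replaced by a space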
def preprocessDataFrame(df):
    """
    Function to run the preprocessing pipeline on all tweets to generate the
    feature "full_text_processed": translating tweets to English, removing
    stopwords & lemmatization, removing URLs and reserved words, lowercasing &
    punctuation removal and VADER sentiment analysis.

    Parameters
    ----------
    df : DataFrame
        Transformed DataFrame with original tweets

    Returns
    -------
    df : DataFrame
        DataFrame with processed tweets
    """
    df['full_text_processed'] = df.apply(
        lambda x: translate_func(x, 'full_text', 'lang'), axis=1)
    # for some reason some rows are type float so make sure nothing will crash
    df['full_text_processed'] = df['full_text_processed'].astype(str)
    api.set_options('urls', 'reserved_words')
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: api.clean(x))
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: x.lower())

    def remove_punct(text):
        table = str.maketrans('', '', string.punctuation)
        return text.translate(table)

    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: remove_punct(x))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))
    stop_words = set(stopwords.words('english'))
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join(
            [word for word in x.split() if word not in stop_words]))
    sid = SentimentIntensityAnalyzer()

    def create_sentiment(x, text):
        return sid.polarity_scores(text)['compound']

    # add sentiment as part of preprocessing
    df['sentiment'] = df.apply(
        lambda x: create_sentiment(x, x['full_text_processed']), axis=1)
    return df
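# preprocessDataFrame() depends on several names defined elsewhere in its module.
# A minimal assumed setup (translate_func is project-specific, so only a
# pass-through stub is sketched here):
import string
import nltk
import preprocessor.api as api
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

def translate_func(row, text_col, lang_col):
    # Stub: the real helper presumably translates row[text_col] into English
    # based on row[lang_col]; here it simply returns the original text.
    return row[text_col]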
def data_preprocessing(tweets_raw):
    clean_text = clean(tweets_raw)
    #print("### Info: Cleaning the data...", end='')
    #tweets_clean = pd.DataFrame(columns=["Date", "Location", "Text"])
    """
    for i in range(tweets_raw.shape[0]):
        clean_text = clean(tweets_raw["Text"][i])
        tweets_raw["Text"][i] = clean_text
    """
    #print("DONE!")
    return tweets_raw
def clean_pipeline(text):
    '''
    Function for apply. Applies the preprocessor.api clean function to raw text.

    args:
        - text: string.
    '''
    toret = unicode_to_ascii(text)
    toret = remove_punctuation(toret)
    toret = clean(toret)
    toret = toret.lower()                     # All lowercase
    toret = remove_urls(toret)
    toret = remove_double_whitespaces(toret)  # Remove double spaces
    toret = remove_lspace(toret)              # Remove leading space
    toret = remove_rspace(toret)              # Remove trailing space
    return toret
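# The helpers called by clean_pipeline() are not shown above. One possible set of
# implementations (the names are kept, the bodies are assumptions):
import re
import string
import unicodedata

def unicode_to_ascii(text):
    # Strip accents and any other non-ASCII characters
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_double_whitespaces(text):
    return re.sub(r'\s{2,}', ' ', text)

def remove_lspace(text):
    return text.lstrip()

def remove_rspace(text):
    return text.rstrip()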
def clean(self, text):
    """
    Parses and cleans a tweet

    :param: Text of the tweet
    :return: A cleaned tweet

    TODO: Update data cleaning function
    """
    if text is None:
        text = self.text_
    # Remove punctuation, username, other elements from text
    text = text.translate(str.maketrans('', '', string.punctuation))
    if re.match(r'^([RT])\w', text):
        tokens = text.split(' ')
        text = ' '.join(tokens[2:])
    tweet = p.clean(text)
    tweet = re.sub(r':', '', tweet)
    return tweet.lower()
train.head(10)
train = train[['selected_text', 'sentiment']]
train.head()
train['sentiment'].unique()
train.groupby('sentiment').nunique()
train["selected_text"].fillna("No content", inplace=True)

temp = []
# Splitting pd.Series to list
data_to_list = train['selected_text'].values.tolist()
for i in range(len(data_to_list)):
    temp.append(clean(data_to_list[i]))
list(temp[:5])

def sent_to_words(sentences):
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(temp))
print(data_words[:10])
len(data_words)
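# Assumed setup for the snippet above; the CSV path and column names follow the
# Kaggle "Tweet Sentiment Extraction" layout and are not confirmed by the snippet
# itself.
import pandas as pd
import gensim
from preprocessor.api import clean

train = pd.read_csv("train.csv")   # hypothetical path; needs 'selected_text' and 'sentiment' columns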
# The opening branch of this menu block is not included in the snippet; it is
# reconstructed here as `if int(user_input) == 1:` since the trailing elif/else
# clearly belong to a user_input check inside a prompt loop.
if int(user_input) == 1:
    public_tweets = tweepy.Cursor(api.search, q=search_term, lang="en").items(no_of_search_items)
    index = 0
    if os.path.isfile('./tweetbykeyword.csv'):
        my_csv_file = open('tweetbykeyword.csv', 'r+')
        reader = csv.DictReader(my_csv_file)
        field_names = ['Index', 'Keyword', 'Tweets']
        # Count how many rows already exist for this keyword so new rows continue the index
        for each_row in reader:
            if search_term == each_row['Keyword']:
                index += 1
        writer = csv.DictWriter(my_csv_file, fieldnames=field_names)
    else:
        my_csv_file = open('tweetbykeyword.csv', 'w')
        field_names = ['Index', 'Keyword', 'Tweets']
        writer = csv.DictWriter(my_csv_file, fieldnames=field_names)
        writer.writeheader()
    for each_tweet in public_tweets:
        data = p.clean(each_tweet.text)
        data = data.encode('utf-8')
        data = data.decode('unicode_escape')
        writer.writerow({'Index': index, 'Keyword': search_term,
                         'Tweets': data})
        index += 1
elif int(user_input) == 2:
    loop = False
else:
    print('Please enter 1 or 2')
def get_stock_sentiment(stock_symbol):
    symbol = stock_symbol.split(".")[0].lower() + " stock"
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CUSTOMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_TOKEN)
    user = tweepy.API(auth)
    tweets = tweepy.Cursor(user.search, q=str(symbol), tweet_mode='extended',
                           lang='en', exclude_replies=True).items(NUM_OF_TWEETS)

    tweet_list = []       # List of tweets alongside polarity
    global_polarity = 0   # Polarity of all tweets === Sum of polarities of individual tweets
    tw_list = []          # List of tweets only => to be displayed on web page
    # Count Positive, Negative to plot pie chart
    pos = 0  # Num of positive tweets
    neg = 1  # Num of negative tweets
    for tweet in tweets:
        count = 20  # Num of tweets to be displayed on web page
        # Convert to TextBlob format for assigning polarity
        tw2 = tweet.full_text
        tw = tweet.full_text
        # Clean
        tw = clean(tw)
        # print("------- CLEANED TWEET -------")
        # print(tw)
        # Replace &amp; with &
        tw = re.sub('&amp;', '&', tw)
        # Remove :
        tw = re.sub(':', '', tw)
        # print("------- TWEET AFTER REGEX MATCHING -------")
        # print(tw)
        # Remove emojis and Hindi characters
        tw = tw.encode('ascii', 'ignore').decode('ascii')
        # print("------- TWEET AFTER REMOVING NON-ASCII CHARS -------")
        # print(tw)
        blob = TextBlob(tw)
        polarity = 0  # Polarity of a single tweet
        for sentence in blob.sentences:
            polarity += sentence.sentiment.polarity
            if polarity > 0:
                pos = pos + 1
            if polarity < 0:
                neg = neg + 1
            global_polarity += sentence.sentiment.polarity
        if count > 0:
            tw_list.append(tw2)
        # tweet_list.append(Tweet(tw, polarity))
        count = count - 1
    try:
        global_polarity = global_polarity / len(tw_list)
    except:
        pass
    neutral = NUM_OF_TWEETS - pos - neg
    if neutral < 0:
        neg = neg + neutral
        neutral = 20
    labels = ['Positive', 'Negative', 'Neutral']
    sizes = [pos, neg, neutral]
    explode = (0, 0, 0)
    fig = plt.figure(figsize=(7.2, 4.8), dpi=65)
    fig1, ax1 = plt.subplots(figsize=(7.2, 4.8), dpi=65)
    ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)
    # Equal aspect ratio ensures that the pie is drawn as a circle
    ax1.axis('equal')
    plt.tight_layout()
    plt.savefig("static/" + stock_symbol + "_sa" + '.png')
    plt.close(fig)
    # plt.show()
    if pos - neg > 10:
        suggestion = " According to sentiment analysis, stock price may RISE"
        tw_pol = " Overall Positive"
    elif neg - pos > 10:
        suggestion = " According to sentiment analysis, stock price may FALL"
        tw_pol = " Overall Negative"
    else:
        suggestion = " According to sentiment analysis, there won't be much volatility in the stock price"
        tw_pol = " Almost neutral "
    return suggestion, tw_list, tw_pol
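# Module-level names assumed by get_stock_sentiment(); the credential values and
# the tweet budget below are placeholders, not real settings.
import re
import tweepy
import matplotlib.pyplot as plt
from textblob import TextBlob
from preprocessor.api import clean

CONSUMER_KEY = 'xxx'
CUSTOMER_SECRET = 'xxx'
ACCESS_TOKEN = 'xxx'
ACCESS_TOKEN_TOKEN = 'xxx'
NUM_OF_TWEETS = 100   # arbitrary example value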
def clean_text(val):
    val = p.clean(val)
    val = re.sub(r'@(\w+)', '', val)
    val = re.sub('#', "", val)
    val = punctuation(val)
    return val
from keras.layers.embeddings import Embedding
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D

print(tf.__version__)

df = pd.read_csv("text_emotion.csv")
df.head()

## Step 1: Data pre-processing
## Remove mentions and "#" symbol in tweet
df['clean_content'] = df.content.apply(lambda x: re.sub(r'@(\w+)', '', x))
df['clean_content'] = df.clean_content.apply(lambda x: re.sub('#', "", x))

## Clean using the tweet-preprocessor package, removing emojis and urls
df['clean_content'] = df.clean_content.apply(lambda x: p.clean(x))

## Remove unnecessary punctuation in the data, but tag ! and ?
def punctuation(val):
    punctuations = '''()-[]{};:'"\,<>./@#$%^&_~'''
    for x in val.lower():
        if x in punctuations:
            val = val.replace(x, " ")
        elif x == "!":
            val = val.replace(x, " XXEXLMARK ")
        elif x == "?":
            val = val.replace(x, " XXQUESMARK ")
    return val
import os
import tweepy as tw
import pandas as pd
import preprocessor.api as p
from preprocessor.api import clean, tokenize, parse

consumer_key = 'AAAAAAAAAAAAAAAAAAAAAAAA'
consumer_secret = 'BBBBBBBBBBBBBBBBBBBBBBBBB'
access_token = 'CCCCCCCCCCCCCCCCCCCCCCCC'
access_token_secret = 'DDDDDDDDDDDDDDDDDDDDDDD'

# perform authentication
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create our twitter api to access tweets from it
api = tw.API(auth)

search_words = "Korea"
date_since = "2019-11-16"

# Collect tweets
tweets = tw.Cursor(api.search,
                   q=search_words,
                   lang="en",
                   since=date_since).items(5)

# Iterate and print tweets
for tweet in tweets:
    print(p.clean(tweet.text))
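# tweet-preprocessor also lets you choose which elements clean() strips via
# set_options(); the option strings below follow the same form as the
# 'urls'/'reserved_words' values used earlier in this file ('mentions' and
# 'hashtags' are assumed to be valid option names).
p.set_options('urls', 'mentions', 'hashtags')   # strip URLs, @mentions and #hashtags only
print(p.clean("Checking out #Korea travel tips with @friend: https://example.com"))
print(p.tokenize("Checking out #Korea travel tips with @friend: https://example.com"))  # tokenize() substitutes placeholder tokens (e.g. $URL$) instead of deleting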
def write_tweets(keyword, file):
    # If the file exists, then read the existing data from the CSV file.
    if os.path.exists(file):
        df = pd.read_csv(file, header=0)
    else:
        df = pd.DataFrame(columns=COLS)

    # page attribute in tweepy.Cursor and iteration
    for page in tweepy.Cursor(api.search, q=keyword,
                              count=200, include_rts=False, since=start_date,
                              tweet_mode="extended").pages(100):
        for status in page:
            new_entry = []
            status = status._json

            # When the script is re-run, the block below updates the retweet and
            # favorite counts that have changed since the last download.
            if status['created_at'] in df['created_at'].values:
                i = df.loc[df['created_at'] == status['created_at']].index[0]
                if status['favorite_count'] != df.at[i, 'favorite_count'] or \
                   status['retweet_count'] != df.at[i, 'retweet_count']:
                    df.at[i, 'favorite_count'] = status['favorite_count']
                    df.at[i, 'retweet_count'] = status['retweet_count']
                continue

            # tweet-preprocessor clean() called for basic preprocessing
            clean_text = clean(status['full_text'])

            # call clean_tweets method for extra preprocessing
            filtered_tweet = clean_tweets(clean_text)

            # new entry append
            new_entry += [status['id'], status['created_at'], status['source'],
                          status['full_text'], filtered_tweet, status['lang'],
                          status['favorite_count'], status['retweet_count']]

            # to append original author of the tweet
            new_entry.append(status['user']['screen_name'])

            try:
                is_sensitive = status['possibly_sensitive']
            except KeyError:
                is_sensitive = None
            new_entry.append(is_sensitive)

            # hashtags and mentions are saved comma separated
            hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
            new_entry.append(hashtags)
            mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])
            new_entry.append(mentions)

            # get location of the tweet if possible
            try:
                location = status['user']['location']
            except TypeError:
                location = ''
            new_entry.append(location)

            try:
                coordinates = [coord for loc in status['place']['bounding_box']['coordinates'] for coord in loc]
            except TypeError:
                coordinates = None
            new_entry.append(coordinates)

            single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
            df = df.append(single_tweet_df, ignore_index=True)

    csvFile = open(file, 'a', encoding='utf-8')
    df.to_csv(csvFile, mode='a', columns=COLS, index=False, encoding="utf-8")
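# COLS is referenced but not defined in the snippet; its order has to match the
# fields appended to new_entry. A hypothetical definition consistent with that
# order (only 'created_at', 'favorite_count' and 'retweet_count' are confirmed
# by the lookups above, the other names are guesses):
COLS = ['id', 'created_at', 'source', 'original_text', 'clean_text', 'lang',
        'favorite_count', 'retweet_count', 'original_author', 'possibly_sensitive',
        'hashtags', 'user_mentions', 'location', 'place_coords']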