import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

'''
tweets_df = pd.read_csv('data/tweets_1600000.csv', encoding='latin')
tweets_df.columns = ['sentiment', 'id', 'time', 'query', 'name', 'tweet']
tweets_df = tweets_df[['tweet', 'sentiment']]
tweets_df['clean_tweet'] = clean.clean(tweets_df['tweet'])
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(lambda x: clean.tokenize(x))
docs2 = tweets_df['clean_tweet']
t2 = Tokenizer()
t2.fit_on_texts(docs2)
vocab_size2 = len(t2.word_index) + 1
# encode the documents
encoded_docs2 = t2.texts_to_sequences(docs2)
'''

# CleanText is the project's own text-cleaning helper, defined elsewhere
clean = CleanText()
# clean() removes urls, emoticons and hashtags
tweets['text'] = clean.clean(tweets['text'])
# remove punctuation and stopwords, lemmatize, and split the sentences into tokens
tweets['text'] = tweets['text'].apply(lambda x: clean.tokenize(x))
docs = tweets['text']
labels = tweets['sentiment']

le = LabelEncoder()
labels_en = le.fit_transform(labels)  # Negative: 0, Positive: 1
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

# tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

# encode the documents
encoded_docs = t.texts_to_sequences(docs)
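# A minimal sketch, not from the original, of the step that naturally follows:
# pad the encoded documents to a fixed length and split into train/test sets.
# maxlen=40 is assumed here because the inference code further below pads to
# 40; the test fraction and random_state are arbitrary illustrative choices.
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

padded_docs = pad_sequences(encoded_docs, maxlen=40, padding='post')
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs, labels_en, test_size=0.2, random_state=42)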
import pandas as pd
import numpy as np
import keras
from keras.models import Model, Sequential
from keras.layers import LSTM
from keras.layers import Flatten, Dense, Dropout, Activation, Input, BatchNormalization
from keras.optimizers import Adam
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

tweets = pd.read_csv('Tweets.csv')
tweets = tweets[['text', 'airline_sentiment']]

clean = CleanText()
tweets['text'] = tweets['text'].apply(lambda x: clean.clean(x))
docs = tweets['text']
labels = tweets['airline_sentiment']

le = LabelEncoder()
labels_en = le.fit_transform(labels)  # Negative: 0, Neutral: 1, Positive: 2
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

# tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

# encode the documents
encoded_docs = t.texts_to_sequences(docs)
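# A minimal sketch of a model these imports suggest (Embedding -> LSTM ->
# softmax); the embedding size, LSTM units, dropout, and training settings are
# assumptions, not the article's actual architecture. maxlen=40 and the name
# model_sentiment are taken from the inference code below, and the 3 output
# units match the three airline sentiment classes.
padded_docs = pad_sequences(encoded_docs, maxlen=40, padding='post')

model_sentiment = Sequential()
model_sentiment.add(Embedding(vocab_size, 100, input_length=40))
model_sentiment.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_sentiment.add(Dense(3, activation='softmax'))
model_sentiment.compile(loss='categorical_crossentropy',
                        optimizer=Adam(lr=0.001),
                        metrics=['accuracy'])
model_sentiment.fit(padded_docs, labels_en, epochs=5, batch_size=128,
                    validation_split=0.2)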
st.title("Sentiment Analysis of Tweets") date = st.sidebar.date_input('Enter Date Range:',[datetime.date(2019, 7, 6), datetime.date(2019, 7, 8)]) limit = st.sidebar.slider('Enter number of Tweets to scrape:',0,1000) lang = 'english' if st.button('Scrape Tweets'): with st.spinner('Scraping Tweets...'): tweets = query_tweets('videogames', begindate = date[0], enddate = date[1], limit = limit, lang = lang) df = pd.DataFrame(t.__dict__ for t in tweets) df = df[['timestamp','text','likes','retweets']] df = df.drop_duplicates(subset=['likes']) clean = CleanText() df['clean_text'] = clean.clean(df['text']) df['clean_text'] = df['clean_text'].apply(lambda x: clean.tokenize(x)) docs = df['clean_text'] #tokenizer t = Tokenizer() t.fit_on_texts(docs) vocab_size = len(t.word_index) + 1 #encode the documents encoded_docs = t.texts_to_sequences(docs) #pad docs to max length padded_docs = pad_sequences(encoded_docs, maxlen = 40, padding = 'post') labels_categorical = model_sentiment.predict(padded_docs)