import sqlite3

import pandas as pd
import texthero as hero
from texthero import stopwords


def get_options_flow():
    '''
    Calls get_all_tweets, cleans the tweet text and saves the result to the
    SQLite database, so the app can load it frequently and automatically
    without impacting performance.
    '''
    # connect to the sqlite database
    conn = sqlite3.connect('stocks.sqlite')

    # use get_all_tweets to pull the data from the twitter users
    ss = get_all_tweets(screen_name="SwaggyStocks")
    uw = get_all_tweets(screen_name="unusual_whales")

    # clean the SwaggyStocks text data
    ss['source'] = 'swaggyStocks'
    ss['text'] = hero.remove_urls(ss['text'])
    ss['text'] = [n.replace('$', '') for n in ss['text']]

    # clean the unusual_whales text data
    uw['source'] = 'unusual_whales'
    uw['text'] = hero.remove_urls(uw['text'])
    uw['text'] = [n.replace('$', '') for n in uw['text']]
    uw['text'] = [n.replace(':', '') for n in uw['text']]
    uw['text'] = [n.replace('\n', ' ') for n in uw['text']]
    uw['text'] = [n.replace('  ', ' ') for n in uw['text']]

    # concat the tweets into one dataframe
    tweets = pd.concat([ss, uw])

    # save the tweets to the sqlite database
    tweets.to_sql('tweets', conn, if_exists='replace')
    print('done')
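# `get_all_tweets` is called above but never defined in these snippets. Below
# is a minimal sketch of what it might look like, assuming Tweepy against the
# standard v1.1 API; the credential variables, the 200-tweet page size, and
# the returned column names are assumptions, not part of the original code.
import tweepy


def get_all_tweets(screen_name):
    # authenticate against the Twitter API (keys assumed to be defined elsewhere)
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # pull the most recent page of tweets for the user
    timeline = api.user_timeline(screen_name=screen_name, count=200,
                                 tweet_mode='extended')

    # shape the result into the columns the cleaning code expects
    return pd.DataFrame({
        'id': [t.id_str for t in timeline],
        'created_at': [t.created_at for t in timeline],
        'text': [t.full_text for t in timeline],
    })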
def get_options_flow():
    conn = sqlite3.connect('stocks.sqlite')

    ss = get_all_tweets(screen_name="SwaggyStocks")
    uw = get_all_tweets(screen_name="unusual_whales")

    ss['source'] = 'swaggyStocks'
    ss['text'] = hero.remove_urls(ss['text'])
    ss['text'] = [n.replace('$', '') for n in ss['text']]

    uw['source'] = 'unusual_whales'
    uw['text'] = hero.remove_urls(uw['text'])
    uw['text'] = [n.replace('$', '') for n in uw['text']]
    uw['text'] = [n.replace(':', '') for n in uw['text']]
    uw['text'] = [n.replace('\n', ' ') for n in uw['text']]
    uw['text'] = [n.replace('  ', ' ') for n in uw['text']]

    tweets = pd.concat([ss, uw])
    tweets.to_sql('tweets', conn, if_exists='replace')
    print('done')
def get_options_flow():
    # connect to the sqlite database
    conn = sqlite3.connect('stocks.sqlite')

    # use get_all_tweets to pull the data from the twitter users
    ss = get_all_tweets(screen_name="SwaggyStocks")
    uw = get_all_tweets(screen_name="unusual_whales")

    # clean the SwaggyStocks text data
    ss['source'] = 'swaggyStocks'
    ss['text'] = hero.remove_urls(ss['text'])
    ss['text'] = [n.replace('$', '') for n in ss['text']]

    # clean the unusual_whales text data
    uw['source'] = 'unusual_whales'
    uw['text'] = hero.remove_urls(uw['text'])
    uw['text'] = [n.replace('$', '') for n in uw['text']]
    uw['text'] = [n.replace(':', '') for n in uw['text']]
    uw['text'] = [n.replace('\n', ' ') for n in uw['text']]
    uw['text'] = [n.replace('  ', ' ') for n in uw['text']]

    # concat the tweets into one dataframe
    tweets = pd.concat([ss, uw])

    # save the tweets to the sqlite database
    tweets.to_sql('tweets', conn, if_exists='replace')
    print('twitter done')
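# Usage sketch: once get_options_flow() has persisted the tweets, other parts
# of the app can read them straight back out of SQLite with pandas. The table
# name and database file come from the code above; the query itself is an
# illustration.
get_options_flow()
conn = sqlite3.connect('stocks.sqlite')
tweets = pd.read_sql('SELECT * FROM tweets', conn)
print(tweets[['source', 'text']].head())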
def cleaning_text(df_name):
    '''
    All the steps of preprocessing.
    :param df_name: the df whose content column must be preprocessed
    :return: df_name with a "clean_content" column added
    '''
    # delete pseudos (user handles) starting with @
    df_name['clean_content'] = df_name['content'].apply(delete_pseudo)

    # clean method from texthero
    df_name['clean_content'] = hero.clean(df_name['clean_content'])

    # delete stopwords with texthero
    default_stopwords = stopwords.DEFAULT
    custom_stopwords = default_stopwords.union(
        set([
            "feel", "feeling", "im", "get", "http", "ive", "go", "day",
            "com", "got", "see", "4pm"
        ]))
    df_name['clean_content'] = hero.remove_stopwords(df_name['clean_content'],
                                                     custom_stopwords)

    # remove urls
    df_name['clean_content'] = hero.remove_urls(df_name['clean_content'])

    # remove angle brackets
    df_name['clean_content'] = hero.remove_angle_brackets(
        df_name['clean_content'])

    # remove digits
    df_name['clean_content'] = hero.preprocessing.remove_digits(
        df_name['clean_content'], only_blocks=False)

    return df_name
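# `delete_pseudo` is referenced but not defined above. A plausible regex-based
# sketch that strips @-handles ("pseudos") from a single string, matching the
# comment in cleaning_text; the exact pattern is an assumption.
import re


def delete_pseudo(text):
    # drop @username tokens, then collapse the double spaces left behind
    return re.sub(r'@\w+', '', text).replace('  ', ' ').strip()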
def get_options_flow():
    ss = get_all_tweets(screen_name="SwaggyStocks")
    uw = get_all_tweets(screen_name="unusual_whales")

    ss['source'] = 'swaggyStocks'
    ss['text'] = hero.remove_urls(ss['text'])
    ss['text'] = [n.replace('$', '') for n in ss['text']]

    uw['source'] = 'unusual_whales'
    uw['text'] = hero.remove_urls(uw['text'])
    uw['text'] = [n.replace('$', '') for n in uw['text']]
    uw['text'] = [n.replace(':', '') for n in uw['text']]
    uw['text'] = [n.replace('\n', ' ') for n in uw['text']]
    uw['text'] = [n.replace('  ', ' ') for n in uw['text']]

    tweets = pd.concat([ss, uw])
    return tweets
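# Unlike the earlier variants, this version returns the combined DataFrame
# instead of writing it to SQLite, so persistence is the caller's job. A
# usage sketch mirroring the to_sql call from the variants above:
tweets = get_options_flow()
tweets.to_sql('tweets', sqlite3.connect('stocks.sqlite'), if_exists='replace')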
def clean(content):
    import neattext.functions as nfx

    # apply each neattext cleaning step by name
    cleaning_steps = ('clean_text', 'remove_stopwords',
                      'remove_userhandles', 'remove_punctuations')
    for step in cleaning_steps:
        content = content.apply(getattr(nfx, step))

    # follow up with texthero cleaners
    content = hero.remove_diacritics(content)
    content = hero.remove_urls(content)
    content = hero.preprocessing.remove_digits(content)
    # content = hero.remove_punctuation(content)
    # content = hero.remove_whitespace(content)
    # content = hero.preprocessing.stem(content)
    return content
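# Usage sketch: `clean` expects a pandas Series of raw text, since each
# neattext function is applied element-wise via Series.apply. The example
# column and values below are illustrative.
df = pd.DataFrame({'text': ['Hello @user, visit https://example.com!! 123']})
df['clean_text'] = clean(df['text'])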
def pipline_processing(df, STOPWORDS=None):
    # the order of these operations matters
    s = hero.remove_urls(df)
    s = hero.remove_html_tags(s)
    s = hero.lowercase(s)
    s = s.apply(removeAt)
    s = hero.remove_punctuation(s)
    s = hero.preprocessing.remove_digits(s, only_blocks=False)
    s = s.apply(convert_emojis)
    s = s.apply(convert_emoticons)
    s = s.apply(lemmatize_words)
    s = hero.remove_stopwords(s, stopwords=STOPWORDS)
    s = hero.remove_whitespace(s)
    s = s.apply(lemmatize_words)
    # s = hero.tokenize(s)
    return s
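# The helpers applied above are not defined in these snippets. Minimal
# sketches under stated assumptions: `removeAt` as a regex strip of
# @-handles, `convert_emojis` via the `emoji` package's demojize, and
# `lemmatize_words` with NLTK's WordNetLemmatizer. `convert_emoticons` would
# need an emoticon lookup table, which is omitted here rather than invented.
import re

import emoji
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def removeAt(text):
    # strip @username mentions
    return re.sub(r'@\w+', '', text)


def convert_emojis(text):
    # turn emoji into words, e.g. a fire emoji becomes " fire "
    return emoji.demojize(text, delimiters=(' ', ' '))


def lemmatize_words(text):
    # lemmatize each whitespace-separated token
    return ' '.join(lemmatizer.lemmatize(w) for w in text.split())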