import sqlite3

import pandas as pd
import texthero as hero


def get_options_flow():
    '''
    Calls the get_all_tweets helper, cleans the tweet text, and saves it
    to the SQLite database so the app can reload it frequently and
    automatically without impacting performance.
    '''
    #connect to the sqlite database
    conn = sqlite3.connect('stocks.sqlite')
    #use get_all_tweets to pull the data from the twitter users
    ss = get_all_tweets(screen_name="SwaggyStocks")
    uw = get_all_tweets(screen_name="unusual_whales")

    #clean the text data
    ss['source'] = 'swaggyStocks'
    ss['text'] = hero.remove_urls(ss['text'])
    ss['text'] = [n.replace('$', '') for n in ss['text']]

    #clean the text data
    uw['source'] = 'unusual_whales'
    uw['text'] = hero.remove_urls(uw['text'])
    uw['text'] = [n.replace('$', '') for n in uw['text']]
    uw['text'] = [n.replace(':', '') for n in uw['text']]
    uw['text'] = [n.replace('\n', ' ') for n in uw['text']]
    uw['text'] = [n.replace('  ', ' ') for n in uw['text']]

    #concat the tweets into one dataframe
    tweets = pd.concat([ss, uw])
    #save the tweets to sqlite database
    tweets.to_sql('tweets', conn, if_exists='replace')
    print('done')
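A note on dependencies: every example on this page calls a get_all_tweets helper that is never shown. The sketch below is a minimal stand-in, assuming the Tweepy v3 client; the credential values are placeholders, not part of the original code. The only contract the examples rely on is that the helper returns a pandas DataFrame with a 'text' column.

import pandas as pd
import tweepy

# placeholder credentials -- substitute real Twitter API keys
CONSUMER_KEY, CONSUMER_SECRET = 'key', 'secret'
ACCESS_TOKEN, ACCESS_TOKEN_SECRET = 'token', 'token-secret'

def get_all_tweets(screen_name):
    # authenticate and build an API client
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    # pull the most recent tweets for the account
    timeline = api.user_timeline(screen_name=screen_name,
                                 count=200, tweet_mode='extended')
    # shape the DataFrame the examples expect
    return pd.DataFrame({'id': [t.id for t in timeline],
                         'created_at': [t.created_at for t in timeline],
                         'text': [t.full_text for t in timeline]})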
Example #4
import texthero as hero
from texthero import stopwords

def cleaning_text(df_name):
    '''
    Runs all of the preprocessing steps on a dataframe's content column.
    :param df_name: dataframe whose 'content' column must be preprocessed
    :return: the dataframe with a 'clean_content' column added
    '''
    # delete pseudonyms (handles) starting with @; delete_pseudo is a project helper
    df_name['clean_content'] = df_name['content'].apply(delete_pseudo)
    # method clean from texthero
    df_name['clean_content'] = hero.clean(df_name['clean_content'])
    # delete stopwords with texthero
    default_stopwords = stopwords.DEFAULT
    custom_stopwords = default_stopwords.union(
        set([
            "feel", "feeling", "im", "get", "http", "ive", "go", "day", "com",
            "got", "see"
            "4pm"
        ]))
    df_name['clean_content'] = hero.remove_stopwords(df_name['clean_content'],
                                                     custom_stopwords)
    # remove urls
    df_name['clean_content'] = hero.remove_urls(df_name['clean_content'])
    # remove angle brackets
    df_name['clean_content'] = hero.remove_angle_brackets(
        df_name['clean_content'])
    # remove digits
    df_name['clean_content'] = hero.preprocessing.remove_digits(
        df_name['clean_content'], only_blocks=False)
    return df_name
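A quick usage sketch for cleaning_text. The delete_pseudo helper is not shown on this page, so a minimal stand-in is included below; it is an assumption, not the project's real implementation.

import re

import pandas as pd

# minimal stand-in for the project's delete_pseudo helper (assumption)
def delete_pseudo(text):
    return re.sub(r'@\w+', '', text)

df = pd.DataFrame({'content': ["@user I'm feeling great today! http://t.co/xyz",
                               "got to see the 4pm show"]})
cleaning_text(df)
print(df['clean_content'])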
Example #5
import pandas as pd
import texthero as hero

def get_options_flow():

    ss = get_all_tweets(screen_name="SwaggyStocks")
    uw = get_all_tweets(screen_name="unusual_whales")

    ss['source'] = 'swaggyStocks'
    ss['text'] = hero.remove_urls(ss['text'])
    ss['text'] = [n.replace('$', '') for n in ss['text']]

    uw['source'] = 'unusual_whales'
    uw['text'] = hero.remove_urls(uw['text'])
    uw['text'] = [n.replace('$', '') for n in uw['text']]
    uw['text'] = [n.replace(':', '') for n in uw['text']]
    uw['text'] = [n.replace('\n', ' ') for n in uw['text']]
    uw['text'] = [n.replace('  ', ' ') for n in uw['text']]

    tweets = pd.concat([ss, uw])

    return tweets
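Unlike the first example, this variant returns the cleaned DataFrame instead of persisting it, so the write to SQLite happens at the call site. Mirroring the first example, that might look like:

import sqlite3

conn = sqlite3.connect('stocks.sqlite')
get_options_flow().to_sql('tweets', conn, if_exists='replace')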
Example #6
import texthero as hero

def clean(content):
    import neattext.functions as nfx
    # apply each neattext cleaning function to the Series in turn
    cleaning_steps = ('clean_text', 'remove_stopwords', 'remove_userhandles',
                      'remove_punctuations')
    for step in cleaning_steps:
        content = content.apply(getattr(nfx, step))
    content = hero.remove_diacritics(content)
    content = hero.remove_urls(content)
    content = hero.preprocessing.remove_digits(content)
    # content = hero.remove_punctuation(content)
    # content = hero.remove_whitespace(content)
    # content = hero.preprocessing.stem(content)
    return content
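clean expects a pandas Series, since each neattext step is applied element-wise through Series.apply. A smoke test with made-up input:

import pandas as pd

s = pd.Series(["@joe check https://example.com!!! Café vibes 123"])
print(clean(s))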
Example #7
import texthero as hero

def pipline_processing(df, STOPWORDS=None):

    # the order of these operations matters
    s = hero.remove_urls(df)
    s = hero.remove_html_tags(s)
    s = hero.lowercase(s)
    # removeAt, convert_emojis, convert_emoticons and lemmatize_words are
    # project helpers defined elsewhere
    s = s.apply(removeAt)
    s = hero.remove_punctuation(s)
    s = hero.preprocessing.remove_digits(s, only_blocks=False)
    s = s.apply(convert_emojis)
    s = s.apply(convert_emoticons)
    s = s.apply(lemmatize_words)
    s = hero.remove_stopwords(s, stopwords=STOPWORDS)
    s = hero.remove_whitespace(s)
    s = s.apply(lemmatize_words)

    # s = hero.tokenize(s)

    return s
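The four .apply helpers (removeAt, convert_emojis, convert_emoticons, lemmatize_words) are project-specific and not shown on this page. With minimal stand-ins for them (assumptions, not the originals), a call looks like:

import re

import pandas as pd

# stand-ins for the project's helpers (assumptions)
def removeAt(text):
    return re.sub(r'@\w+', '', text)

def convert_emojis(text):
    return text  # no-op placeholder

def convert_emoticons(text):
    return text  # no-op placeholder

def lemmatize_words(text):
    return text  # no-op placeholder

s = pd.Series(["Check THIS out 123 https://example.com @someone!!!"])
print(pipline_processing(s))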