Example #1
def process_tweets(dftw):
    print('\tvik : In process tweets')

    # --- Filters: drop irrelevant, offensive, reply, cashtag, and empty tweets ---
    kywrds = ['tsla', 'tesla', 'elon', 'musk']
    badwords = [
        'n***a', 'nigger', 'gay', 'pussy', 'pussc', 'c**t', 'f**k', 'dick',
        'c**k', 'suck', 'w***e', 'pimp', 'wtf', 'asshole', 'bitch'
    ]

    # Drop tweets that do not contain any relevant keyword. Even though the
    # API searches by keyword, some returned tweets lack these keywords
    # (tsla, tesla, ...) in the text because the match came from the user
    # name instead. Also drop tweets that are replies (text starting with @).

    for index, row in dftw.iterrows():
        txt = row['tweet']
        ctxt = tc.p.clean(txt)
        low = ctxt.lower()
        if (not any(wrd in low for wrd in kywrds)
                or any(wrd in low for wrd in badwords)
                or ctxt.startswith('@')):
            dftw.drop(index, axis=0, inplace=True)

    # Drop tweets containing any cashtags (count_cashtags > 0)
    for index, row in dftw.iterrows():
        txt = row['tweet']
        ctxt = tc.p.clean(txt)
        if tc.count_cashtags(ctxt) > 0:
            dftw.drop(index, axis=0, inplace=True)

    # Drop tweets left with zero words after cleaning
    for index, row in dftw.iterrows():
        txt = row['tweet']
        ctxt = tc.p.clean(txt)
        ctxt = tc.remove_mention(ctxt)
        ctxt = tc.remove_hashtag(ctxt)
        ctxt = tc.remove_cashtag(ctxt)
        if len(ctxt.split()) < 1:
            dftw.drop(index, axis=0, inplace=True)

    nrows = dftw.shape[0]

    dftw.reset_index(inplace=True, drop=True)

    if nrows > 0:
        dftw = predict_label(dftw)

    return dftw
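
# Mutating dftw while iterating with iterrows() works here but is slow and
# fragile. A minimal vectorized sketch of the same three filters, assuming
# the same tc helpers (tc.p.clean, tc.count_cashtags, tc.remove_mention,
# tc.remove_hashtag, tc.remove_cashtag) are importable:
def process_tweets_vectorized(dftw, kywrds, badwords):
    cleaned = dftw['tweet'].apply(tc.p.clean)
    low = cleaned.str.lower()
    stripped = (cleaned.apply(tc.remove_mention)
                .apply(tc.remove_hashtag)
                .apply(tc.remove_cashtag))
    keep = (low.apply(lambda t: any(w in t for w in kywrds))
            & ~low.apply(lambda t: any(w in t for w in badwords))
            & ~cleaned.str.startswith('@')
            & (cleaned.apply(tc.count_cashtags) == 0)
            & (stripped.str.split().str.len() > 0))
    return dftw[keep].reset_index(drop=True)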
Example #2
def predict_label(dftw):

    dftw['tidy_tweet'] = dftw['tweet'].apply(lambda x: tc.p.clean(x))
    dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(
        lambda x: tc.remove_hashtag(x))
    dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(
        lambda x: tc.replace_tesla(x))
    dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(
        lambda x: tc.remove_mention(x))
    dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(
        lambda x: tc.replace_chars(x))
    dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(
        lambda x: tc.normalize_doc(x))

    nrows = dftw.shape[0]

    # Deserialize the trained classifier and its vectorizer; use context
    # managers so the file handles are closed.
    with open('save_model/bayes_count.sav', 'rb') as f:
        model = pickle.load(f)
    with open('save_model/count_vect.sav', 'rb') as f:
        count_vect = pickle.load(f)

    ser = dftw['tidy_tweet']
    ser_count = count_vect.transform(ser)

    proba = model.predict_proba(ser_count)

    ii = 0
    kk = 0
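    # Keep only tweets the classifier is confident about: the predicted
    # class probability must exceed 0.8 for one of the two classes,
    # otherwise the row is dropped.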
    for index, row in dftw.iterrows():
        if (proba[ii][0] > 0.8 or proba[ii][1] > 0.8):
            kk += 1
        else:
            dftw.drop(index, axis=0, inplace=True)
        ii += 1

    # print('\n\tii,kk = ', ii, kk)
    # print('\n\tdftw.shape = ', dftw.shape)

    return dftw
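
# The confidence filter in predict_label can be vectorized instead of using
# iterrows(). A minimal sketch, assuming proba is the (n_rows, 2) array that
# scikit-learn's predict_proba returns:
import numpy as np

def filter_confident(dftw, proba, thresh=0.8):
    # Keep rows where the winning class probability exceeds thresh; the
    # boolean mask is aligned positionally with dftw's rows.
    mask = np.asarray(proba).max(axis=1) > thresh
    return dftw[mask].reset_index(drop=True)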
Example #3
    # print(df[df.index.duplicated(keep=False)])

    df['senti'] = df['senti'].astype('category')

    # --------Clean text-----------

    # Delete tweets with more than cvars.cash_thresh cashtags
    cond_cash = df['text'].apply(
        lambda x: tc.count_cashtags(x) > cvars.cash_thresh)
    df.drop(index=df[cond_cash].index, inplace=True)
    print('\n\tdf.shape = ', df.shape)

    df['tidy_text'] = df['text'].apply(lambda x: tc.clean_emoji_url(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.remove_hashtag(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.remove_cashtag(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.remove_mention(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.replace_chars(x))
    df['tidy_text'] = df['tidy_text'].apply(lambda x: tc.normalize_doc(x))

    # Drop rows with empty tidy_text. After cleaning, it is possible that
    # every token in tidy_text is deleted. For example, the following tweet
    # contains zero tokens after cleaning:
    # $CUBE $EXR $HOG $KO $LSI $PSA $IRM https://t.co/GFZTPvIifx

    cond = df['tidy_text'].apply(lambda x: tc.count_toks(x) == 0)

    print('\n\tcond.shape = ', cond.shape)
    print('\n\tcond.value_counts = ', cond.value_counts())

    df.drop(index=df[cond].index, inplace=True)
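
# Why the zero-token check matters: pushing the sample tweet from the
# comment above through the same cleaning chain removes the URL and every
# cashtag, leaving nothing. A quick check, assuming the same tc helpers:
sample = '$CUBE $EXR $HOG $KO $LSI $PSA $IRM https://t.co/GFZTPvIifx'
tidy = tc.clean_emoji_url(sample)
for step in (tc.remove_hashtag, tc.remove_cashtag, tc.remove_mention,
             tc.replace_chars, tc.normalize_doc):
    tidy = step(tidy)
print(tc.count_toks(tidy))  # expected: 0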
Example #4
def sentiment_tweets():

    print('\n\t\tEntered sentiment_tweets : ')

    # Deserialize the model
    with open('data/bayes_fit.pkl', 'rb') as f:
        count_vect, model = pickle.load(f)

    # On entry (very first time the app is launched), make sure the file exists
    while not os.path.exists('data_tweets/streaming_tweets_save.csv'):
        print('\t\tsentiment_tweets() : Sleeping')
        time.sleep(20)

    dftw = pd.DataFrame(columns=cvars.cols_senti)

    skiprows = 0
    while True:
        # Empty out dataframe, to be sure
        dftw.drop(dftw.index, inplace=True)

        # Read in only the latest tweets (within the last nsecs) via skiprows.
        dftw = pd.read_csv('data_tweets/streaming_tweets_save.csv',
                           names=cvars.cols, skiprows=skiprows)

        shp = dftw.shape

        cond_cash = dftw['tweet'].apply(
            lambda x: tc.count_cashtags(x) > cvars.cash_thresh)
        dftw.drop(index=dftw[cond_cash].index, inplace=True)

        # print('\n\t^^ skiprows = ',skiprows)
        # print('\n^^ shape = ',dftw.shape)

        if not dftw.empty:

            dftw['tidy_tweet'] = dftw['tweet'].apply(lambda x: tc.clean_emoji_url(x))
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(lambda x: tc.remove_hashtag(x))
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(lambda x: tc.remove_cashtag(x))
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(lambda x: tc.remove_mention(x))
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(lambda x: tc.replace_chars(x))
            dftw['tidy_tweet'] = dftw['tidy_tweet'].apply(lambda x: tc.normalize_doc(x))

            cond = dftw['tidy_tweet'].apply(lambda x: tc.count_toks(x) == 0)
            dftw.drop(index=dftw[cond].index, inplace=True)

            if not dftw.empty:
                # iterrows is inefficient. However the number of rows being
                # processed is small.
                for indx, row in dftw.iterrows():
                    # predict() returns a length-1 array; store the scalar label
                    dftw.loc[indx, 'senti'] = model.predict(
                        count_vect.transform([row['tidy_tweet']]))[0]
                dftw['wt_senti'] = dftw.apply(
                    lambda x: weighted_senti(
                        x['senti'],
                        x['retweet_count'] + x['favorite_count'],
                        x['verified'],
                        x['followers_count'] + x['friends_count']),
                    axis=1)
                dftw[cvars.cols_display].to_csv(
                    'data_tweets/senti_tweets.csv',
                    mode='a', header=False, index=False)

        skiprows += shp[0]
        time.sleep(cvars.nsecs)
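
# weighted_senti is called above but not shown. A purely hypothetical sketch
# of a function matching that call signature; the weighting scheme below
# (log-scaled engagement and reach, a bonus for verified accounts, and a
# numeric senti label) is an assumption, not taken from the source:
import math

def weighted_senti(senti, engagement, verified, reach):
    # Hypothetical: amplify the raw sentiment label by how widely the tweet
    # travelled (retweets + favorites) and the author's audience size.
    weight = (1.0 + math.log1p(engagement)) * (1.0 + math.log1p(reach))
    if verified:
        weight *= 1.5  # hypothetical bump for verified accounts
    return float(senti) * weight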
Example #5
    def test_mention3(self):
        checkstr = tc.remove_mention(
            'Elon will Confirm that Tesla is Unstoppable before Battery Day - Heres ... via @YouTube'
        )
        outstr = 'Elon will Confirm that Tesla is Unstoppable before Battery Day - Heres ... via '
        self.assertEqual(checkstr, outstr)
Example #6
    def test_mention2(self):
        checkstr = tc.remove_mention(
            'OKE new 52 week high of $71.78 $OKE https://t.co/Kul3gUQMP1 @benzinga'
        )
        outstr = 'OKE new 52 week high of $71.78 $OKE https://t.co/Kul3gUQMP1 '
        self.assertEqual(checkstr, outstr)
Example #7
    def test_mention1(self):
        checkstr = tc.remove_mention(
            'So when are we going to get @LEGO_Group models of @Tesla cars, @elonmusk?'
        )
        outstr = 'So when are we going to get  models of  cars, ?'
        self.assertEqual(checkstr, outstr)
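
# The three tests above pin down remove_mention's contract: each @handle is
# replaced with an empty string and the surrounding whitespace is preserved
# (hence the double spaces in test_mention1's expected output). A minimal
# implementation consistent with these tests; the real tc module may differ:
import re

def remove_mention(text):
    # Drop '@' followed by word characters; neighbouring spaces survive.
    return re.sub(r'@\w+', '', text)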