Example #1
def getTweets(hashtag):
    try:
        posts = api.user_timeline(screen_name=hashtag, count=10000, lang="en", tweet_mode="extended")
        tweetData = []
        for post in posts:
            result = {}
            result['created_at'],result['Tweet'],result['user'] = str(post.created_at),post.full_text,post.user.name
            result['screen_name'],result['location'],result['about'] = post.user.screen_name,post.user.location,post.user.description
            result['followers'],result['following'] = post.user.followers_count,post.user.friends_count
            result['retweetCount'],result['likeCount'] = post.retweet_count,post.favorite_count
            result['cleanedTweet'] = clean_tweets(clean(post.full_text))
            result['processedTweet'] = clean(post.full_text)
            
            blob = TextBlob(str(result['cleanedTweet']))
            Sentiment = blob.sentiment     
            polarity,subjectivity = Sentiment.polarity,Sentiment.subjectivity
            sentimentClass = 'Negative' if polarity < 0 else 'Positive' if polarity > 0 else 'Neutral'
            
            result['SentimentScore'],result['Polarity'],result['Subjectivity'] = sentimentClass,polarity,subjectivity
            tweetData.append(result)
    
        tweet_df = pd.DataFrame(tweetData)
        return tweet_df
    except Exception as e:
        print('Error on line {}: {}'.format(sys.exc_info()[-1].tb_lineno, e))
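This example assumes a configured tweepy client (api) plus the clean and clean_tweets helpers; a minimal sketch of that setup, with placeholder credentials (names inferred from the snippet, not given in the source):

# Assumed setup for Example #1 -- credentials are placeholders
import sys
import pandas as pd
import tweepy
from textblob import TextBlob
from preprocessor.api import clean  # tweet-preprocessor's clean()

auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
api = tweepy.API(auth)
# clean_tweets() is assumed to be an extra string-cleaning helper defined elsewhere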
Example #2
    def clean_tweet(self, text=None):
        # Fall back to the instance's stored text when none is given
        if text is None:
            text = self.text_

        # p.clean() strips URLs, mentions, hashtags, emojis, etc.;
        # then drop the stray colons it leaves behind
        tweet = p.clean(text)
        tweet = re.sub(r':', '', tweet)
        return tweet
Example #3
def clean_tweets(df):
    tempArr = []
    for line in df:
        # send to tweet_processor
        tmpL = clean(str(line))
        # remove punctuation and convert all tweets to lower case
        tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower())
        tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
        tempArr.append(tmpL)
    return tempArr
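The two compiled regexes above are referenced but never defined in this example; a plausible sketch, modeled on common tweet/review-cleaning code (the exact patterns are an assumption):

import re

# Assumed definitions -- not part of the original example
REPLACE_NO_SPACE = re.compile(r"[.;:!\'?,\"()\[\]]")              # punctuation to delete outright
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")  # separators replaced by a space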
Example #4
def preprocessDataFrame(df):
    """
    Function to run the preprocessing pipeline on all tweets to generate
    the feature "full_text_processed": Translating tweets to English, removing
    stopwords & lemmatization, removing URLs and reserved words, lowercasing &
    punctuation removal and VADER sentiment analysis.

    Parameters
    ----------
    df : DataFrame
        Transformed DataFrame with original tweets

    Returns
    -------
    df : DataFrame
        DataFrame with processed tweets
    """
    df['full_text_processed'] = df.apply(
        lambda x: translate_func(x, 'full_text', 'lang'), axis=1)

    # some rows may come through as floats (e.g. NaN), so cast to str to avoid crashes
    df['full_text_processed'] = df['full_text_processed'].astype(str)

    api.set_options('urls', 'reserved_words')
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: api.clean(x))
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: x.lower())

    def remove_punct(text):
        table = str.maketrans('', '', string.punctuation)
        return text.translate(table)

    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: remove_punct(x))

    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))

    stop_words = set(stopwords.words('english'))
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join(
            [word for word in x.split() if word not in stop_words]))

    sid = SentimentIntensityAnalyzer()

    def create_sentiment(x, text):
        # x (the row) is unused; kept so the apply signature stays uniform
        return sid.polarity_scores(text)['compound']

    # add VADER compound sentiment as part of preprocessing
    df['sentiment'] = df.apply(
        lambda x: create_sentiment(x, x['full_text_processed']), axis=1)

    return df
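This pipeline assumes the required NLTK resources are already available; a one-time setup along these lines covers the lemmatizer, stopword list, and VADER lexicon:

import nltk

# One-time downloads used by preprocessDataFrame
nltk.download('wordnet')        # WordNetLemmatizer
nltk.download('stopwords')      # stopwords.words('english')
nltk.download('vader_lexicon')  # SentimentIntensityAnalyzer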
Example #5
def data_preprocessing(tweets_raw):

    print("### Info: Cleaning the data...", end='')
    # Clean each tweet in the "Text" column with preprocessor's clean()
    tweets_raw["Text"] = tweets_raw["Text"].apply(lambda t: clean(str(t)))
    print("DONE!")

    return tweets_raw
Example #6
def clean_pipeline(text):
    '''
    Applies the preprocessor.api clean function to raw text;
    meant to be used with DataFrame.apply.

    args:
        - text: string.
    '''
    toret = unicode_to_ascii(text)
    toret = remove_punctuation(toret)
    toret = clean(toret)
    toret = toret.lower()  # All lowercase
    toret = remove_urls(toret)
    toret = remove_double_whitespaces(toret)  # Remove double spaces
    toret = remove_lspace(toret)  # Remove leading space
    toret = remove_rspace(toret)
    return toret
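The helper functions in this pipeline are not shown in the example; a minimal sketch of what they might look like (the names come from the snippet, the implementations are assumptions):

import re
import string
import unicodedata

def unicode_to_ascii(text):
    # Normalize and drop any characters that do not map to ASCII
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_double_whitespaces(text):
    return re.sub(r'\s{2,}', ' ', text)

def remove_lspace(text):
    return text.lstrip()

def remove_rspace(text):
    return text.rstrip()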
Example #7
    def clean(self, text):
        """
        Parses and cleans a tweet
        :param: Text of the tweet
        :return: A cleaned tweet
        TODO: Update data cleaning function
        """
        if text is None:
            text = self.text_

        # Remove punctuation, then strip the leading "RT username" from retweets
        text = text.translate(str.maketrans('', '', string.punctuation))
        if re.match(r'^RT\s', text):
            tokens = text.split(' ')
            text = ' '.join(tokens[2:])

        tweet = p.clean(text)
        tweet = re.sub(r':', '', tweet)

        return tweet.lower()
Example #8
train.head(10)

train = train[['selected_text', 'sentiment']]
train.head()

train['sentiment'].unique()

train.groupby('sentiment').nunique()

train["selected_text"].fillna("No content", inplace=True)

temp = []
# Split the pd.Series into a list and clean each entry
data_to_list = train['selected_text'].values.tolist()
for i in range(len(data_to_list)):
    temp.append(clean(data_to_list[i]))
temp[:5]


def sent_to_words(sentences):
    for sentence in sentences:
        # simple_preprocess tokenizes, lowercases and strips punctuation;
        # deacc=True also removes accent marks
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)


data_words = list(sent_to_words(temp))
print(data_words[:10])

len(data_words)

Example #9
        public_tweets = tweepy.Cursor(api.search, q=search_term,
                                      lang="en").items(no_of_search_items)

        index = 0
        if os.path.isfile('./tweetbykeyword.csv'):
            # File exists: count earlier rows for this keyword so the index
            # continues, then append new rows after the existing ones
            my_csv_file = open('tweetbykeyword.csv', 'r+')
            reader = csv.DictReader(my_csv_file)
            field_names = ['Index', 'Keyword', 'Tweets']
            for each_row in reader:
                if search_term == each_row['Keyword']:
                    index += 1
            writer = csv.DictWriter(my_csv_file, fieldnames=field_names)
        else:
            # First run: create the file and write the header row
            my_csv_file = open('tweetbykeyword.csv', 'w')
            field_names = ['Index', 'Keyword', 'Tweets']
            writer = csv.DictWriter(my_csv_file, fieldnames=field_names)
            writer.writeheader()

        for each_tweet in public_tweets:
            # Clean each tweet with tweet-preprocessor before writing it out
            data = p.clean(each_tweet.text)
            data = data.encode('utf-8')
            data = data.decode('unicode_escape')
            writer.writerow({'Index': index, 'Keyword': search_term,
                             'Tweets': data})
            index += 1
        my_csv_file.close()

    elif int(user_input) == 2:
        loop = False
    else:
        print('Please enter 1 or 2')
Example #10
def get_stock_sentiment(stock_symbol):
    symbol = stock_symbol.split(".")[0].lower() + " stock"
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CUSTOMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_TOKEN)
    user = tweepy.API(auth)
    tweets = tweepy.Cursor(user.search, q=str(symbol), tweet_mode='extended', lang='en',exclude_replies=True).items(NUM_OF_TWEETS)
    tweet_list = []  # List of tweets alongside polarity
    global_polarity = 0  # Polarity of all tweets == sum of individual tweet polarities
    tw_list = []  # List of tweets only => to be displayed on web page
    # Count positive and negative tweets to plot the pie chart
    pos = 0  # Number of positive tweets
    neg = 0  # Number of negative tweets
    count = 20  # Number of tweets to be displayed on the web page
    for tweet in tweets:
        # Convert to TextBlob format for assigning polarity
        tw2 = tweet.full_text
        tw = tweet.full_text
        # Clean
        tw = clean(tw)
        #print("-------------------------------CLEANED TWEET-----------------------------")
        #print(tw)
        # Replace &amp; by &
        tw = re.sub('&amp;', '&', tw)
        # Remove :
        tw = re.sub(':', '', tw)
        #print("-------------------------------TWEET AFTER REGEX MATCHING-----------------------------")
        #print(tw)
        # Remove emojis and non-ASCII (e.g. Hindi) characters
        tw = tw.encode('ascii', 'ignore').decode('ascii')

        #print("-------------------------------TWEET AFTER REMOVING NON ASCII CHARS-----------------------------")
        #print(tw)
        blob = TextBlob(tw)
        polarity = 0  # Polarity of this single tweet
        for sentence in blob.sentences:
            polarity += sentence.sentiment.polarity
            global_polarity += sentence.sentiment.polarity
        # Classify the tweet once, after summing its sentence polarities
        if polarity > 0:
            pos = pos + 1
        elif polarity < 0:
            neg = neg + 1

        if count > 0:
            tw_list.append(tw2)

        #tweet_list.append(Tweet(tw, polarity))
        count = count - 1
    try:
        global_polarity = global_polarity / len(tw_list)
    except ZeroDivisionError:
        pass

    neutral = NUM_OF_TWEETS - pos - neg
    if neutral < 0:
        neg = neg + neutral
        neutral = 0
    labels = ['Positive', 'Negative', 'Neutral']
    sizes = [pos, neg, neutral]
    explode = (0, 0, 0)
    fig1, ax1 = plt.subplots(figsize=(7.2, 4.8), dpi=65)
    ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=90)
    # Equal aspect ratio ensures that pie is drawn as a circle
    ax1.axis('equal')
    plt.tight_layout()
    plt.savefig("static/" + stock_symbol + "_sa" + '.png')
    plt.close(fig1)
    #plt.show()
    if pos - neg > 10:
        suggestion = "According to sentiment analysis, stock price may RISE"
        tw_pol = "Overall Positive"
    elif neg - pos > 10:
        suggestion = "According to sentiment analysis, stock price may FALL"
        tw_pol = "Overall Negative"
    else:
        suggestion = "According to sentiment analysis, there won't be much volatility in the stock price"
        tw_pol = "Almost Neutral"

    return suggestion, tw_list, tw_pol
Example #11
def clean_text(val):
    val = p.clean(val)
    val = re.sub(r'@(\w+)', '', val)  # strip @mentions
    val = re.sub(r'#', '', val)       # strip the '#' symbol
    val = punctuation(val)            # punctuation() is defined in Example #12
    return val
Example #12
from keras.layers.embeddings import Embedding
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
print(tf.__version__)

df = pd.read_csv("text_emotion.csv")
df.head()

## Step 1: Data pre-processing


## Remove mentions and "#" symbol in tweet
df['clean_content'] = df.content.apply(lambda x: re.sub(r'@(\w+)', '', x))
df['clean_content'] = df.clean_content.apply(lambda x: re.sub(r'#', '', x))

## Clean using the tweet-preprocessor package, removing emojis and urls
df['clean_content'] = df.clean_content.apply(lambda x: p.clean(x))


## Remove unnecessary punctuation in the data, but tag ! and ?

def punctuation(val):
    punctuations = '''()-[]{};:'"\,<>./@#$%^&_~'''

    for x in val.lower():
        if x in punctuations:
            val = val.replace(x, " ")
        elif x == "!":
            val = val.replace(x, " XXEXLMARK ")
        elif x == "?":
            val = val.replace(x, " XXQUESMARK ")
    return val
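A short usage sketch for the function above, continuing the same df (this line is an assumption about how it gets applied; it is not shown in the source):

## Replace punctuation and tag '!'/'?' tokens in the cleaned tweets
df['clean_content'] = df.clean_content.apply(punctuation)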
Example #13
import os
import tweepy as tw
import pandas as pd
import preprocessor.api as p
from preprocessor.api import clean, tokenize, parse

consumer_key = 'AAAAAAAAAAAAAAAAAAAAAAAA'
consumer_secret = 'BBBBBBBBBBBBBBBBBBBBBBBBB'

access_token = 'CCCCCCCCCCCCCCCCCCCCCCCC'
access_token_secret = 'DDDDDDDDDDDDDDDDDDDDDDD'

# perform authentication
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create our twitter api to access tweets from it
api = tw.API(auth)

search_words = "Korea"
date_since = "2019-11-16"

# Collect tweets
tweets = tw.Cursor(api.search, q=search_words, lang="en",
                   since=date_since).items(5)

# Iterate and print tweets
for tweet in tweets:
    print(p.clean(tweet.text))
Example #14
def write_tweets(keyword, file):
    # If the file exists, then read the existing data from the CSV file.
    if os.path.exists(file):
        df = pd.read_csv(file, header=0)
    else:
        df = pd.DataFrame(columns=COLS)
    # Use the pages() attribute of tweepy.Cursor to iterate page by page
    for page in tweepy.Cursor(api.search, q=keyword,
                              count=200, include_rts=False, since=start_date, tweet_mode="extended").pages(100):
        for status in page:
            new_entry = []
            status = status._json

            # On re-runs, refresh the retweet and favorite counts that have
            # changed since the last download
            if status['created_at'] in df['created_at'].values:
                i = df.loc[df['created_at'] == status['created_at']].index[0]
                if status['favorite_count'] != df.at[i, 'favorite_count'] or \
                   status['retweet_count'] != df.at[i, 'retweet_count']:
                    df.at[i, 'favorite_count'] = status['favorite_count']
                    df.at[i, 'retweet_count'] = status['retweet_count']
                continue

            # tweet-preprocessor's clean() for basic preprocessing
            clean_text = clean(status['full_text'])

            # call the clean_tweets method for extra preprocessing
            filtered_tweet = clean_tweets(clean_text)
            #new entry append
            new_entry += [status['id'], status['created_at'],
                          status['source'], status['full_text'],filtered_tweet,  status['lang'],
                          status['favorite_count'], status['retweet_count']]

            #to append original author of the tweet
            new_entry.append(status['user']['screen_name'])

            try:
                is_sensitive = status['possibly_sensitive']
            except KeyError:
                is_sensitive = None
            new_entry.append(is_sensitive)

            # hashtags and mentions are saved comma-separated
            hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
            new_entry.append(hashtags)
            mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])
            new_entry.append(mentions)

            #get location of the tweet if possible
            try:
                location = status['user']['location']
            except TypeError:
                location = ''
            new_entry.append(location)

            try:
                coordinates = [coord for loc in status['place']['bounding_box']['coordinates'] for coord in loc]
            except TypeError:
                coordinates = None
            new_entry.append(coordinates)

            single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
            df = pd.concat([df, single_tweet_df], ignore_index=True)
    # Write the combined old + new rows back out once, after all pages
    df.to_csv(file, index=False, encoding="utf-8")
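COLS is used but never defined in this example; a plausible definition inferred from the order fields are appended to new_entry (the column names themselves are assumptions):

# Assumed column list, matching the append order above
COLS = ['id', 'created_at', 'source', 'original_text', 'clean_text', 'lang',
        'favorite_count', 'retweet_count', 'original_author',
        'possibly_sensitive', 'hashtags', 'user_mentions', 'location',
        'coordinates']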