def set_sentence_(sentence):
    # First pass: strip URLs and emojis, then expand hashtags.
    p.set_options(p.OPT.URL, p.OPT.EMOJI)
    sentence = p.clean(sentence)
    sentence = hashtag_power(sentence)
    # Second pass: strip any remaining hashtags, then punctuation and accents.
    p.set_options(p.OPT.HASHTAG)
    sentence = p.clean(sentence)
    sentence = punc(sentence)
    sentence = Enleve_Accents(sentence)
    return sentence
def get_data(uuid, server, port, title, start, end):
    # URL of the dataset
    url = "http://" + server + ":" + port + "/api/data/uuid/" + uuid + \
          "?endtime=" + str(int(end) * 1000) + \
          "&starttime=" + str(int(start) * 1000) + "&limit=-1"
    # Request the data in JSON format
    print(url)
    try:
        r = requests.get(url)
    except Exception as e:
        print(e)
        sys.exit(1)
    archiver_data = r.json()
    if len(archiver_data[0]['Readings']) == 0:
        sys.exit(1)
    # Convert to a dataframe
    df = pandas.DataFrame(archiver_data[0]['Readings'], columns=["Timestamp", title])
    df = df.set_index('Timestamp')
    df.index = pandas.to_datetime(df.index, unit='ms')
    df = df.tz_localize("UTC").tz_convert("Asia/Kolkata")
    df = clean(df, title)
    # Return a series with a DatetimeIndex for resampling and further processing
    return df
def action(self, tweets_list):
    corpus = []
    for t in tweets_list:
        tweet = t["text"]
        tweet = p.clean(tweet.encode("utf-8"))
        # tweet = set_sentence(tweet.encode("utf-8"))
        s = get_categorie_([tweet])
        if s != Nocat:
            corpus.append(tweet)
            t["cat"] = s
            print(tweet)
            print(t)
def get_twitter():
    import pandas as pd
    import preprocessor as p
    with open("2014_india_floods.csv") as fl:
        data = pd.read_csv(fl)
    tweet_id = data['tweet_id'].tolist()
    text = data['tweet_text'].tolist()
    text = [p.clean(t) for t in text]
    labels = data['choose_one_category'].tolist()
    return (text, labels)
def __init__(self, program, mode="MIPS"):
    super(Assembler, self).__init__()
    try:
        text = program.read()
    except AttributeError:
        text = program
    self.mode = mode.upper()
    self.registers = Registers(self.mode)
    lines = text.split("\n")
    lines = clean(lines, self.mode)
    instrs, data = split_sections(lines)
    self.memory = Memory()
    for d in data:
        self.memory.insert(d)
    instrs = preprocess(instrs, self.mode)
    self.labels = label_positions(instrs)
    self.instructions = [Instruction(instr) for instr in instrs]
def clean_tweet(self, tweet_text):
    """
    Take a raw tweet and return a cleaned list of tweet tokens.
    :param tweet_text: raw tweet text
    :return: list of token words
    """
    tweet = preprocessor.clean(tweet_text)
    tokens = [word[1:] if word.startswith('#') else word for word in tweet.split(' ')]
    tokens = self.replace_abbreviations(tokens)
    tokens = self.remove_stopwords_spelling_mistakes(tokens)
    tokens = gensim.utils.simple_preprocess(' '.join(tokens))
    return tokens
tweet_senti = []
tweet_dict = {}
ptweet = 0
ntweet = 0
ntrl = 0
flag = 0
tmp_list = []    # cleaned tweet strings (assumed: not initialised in the original snippet)
clean_list = []  # fully cleaned tweet strings (assumed: not initialised in the original snippet)
connection = MongoClient()
db = connection.indian_national_congress
posts = db.bjp
count = posts.find().count()
print(count)
# drop the SMILEY and EMOJI options below if they are to be kept in the final text
p.set_options(p.OPT.RESERVED, p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG,
              p.OPT.NUMBER, p.OPT.SMILEY, p.OPT.EMOJI)
for d in posts.find({}, {'text': 1, '_id': False}):
    tmp_item = str(d)
    tmp_list.append(p.clean(tmp_item))
for item in tmp_list:
    item = item.strip()
    item = item.replace('RT', '')
    item = item.replace('\\n', '')
    item = " ".join(re.findall("[a-zA-Z]+", item))
    tmp_var = re.sub(r'^\S*\s', '', item)
    clean_list.append(tmp_var)
df = pd.DataFrame(clean_list, columns=["text"])
df.to_csv('inc.csv', index=False)
def write_tweets(keyword, file):
    # If the file exists, read the existing data from the CSV file.
    if os.path.exists(file):
        df = pd.read_csv(file, header=0)
    else:
        df = pd.DataFrame(columns=COLS)
    # page attribute in tweepy.Cursor and iteration
    for page in tweepy.Cursor(api.search, q=keyword, count=200,
                              include_rts=False, since=start_date).pages(50):
        for status in page:
            new_entry = []
            status = status._json
            # check whether the tweet is in English, otherwise skip to the next tweet
            if status['lang'] != 'en':
                continue
            # on re-runs, the block below updates the retweet and favorite
            # counts that have changed since the last download
            if status['created_at'] in df['created_at'].values:
                i = df.loc[df['created_at'] == status['created_at']].index[0]
                if status['favorite_count'] != df.at[i, 'favorite_count'] or \
                   status['retweet_count'] != df.at[i, 'retweet_count']:
                    df.at[i, 'favorite_count'] = status['favorite_count']
                    df.at[i, 'retweet_count'] = status['retweet_count']
                continue
            # tweet-preprocessor called for basic preprocessing
            clean_text = p.clean(status['text'])
            # call the clean_tweets method for extra preprocessing
            filtered_tweet = clean_tweets(clean_text)
            # pass to TextBlob for sentiment calculations
            blob = TextBlob(filtered_tweet)
            Sentiment = blob.sentiment
            # separate polarity and subjectivity into two variables
            polarity = Sentiment.polarity
            subjectivity = Sentiment.subjectivity
            # append the new entry
            new_entry += [
                status['id'], status['created_at'], status['source'],
                status['text'], filtered_tweet, Sentiment, polarity,
                subjectivity, status['lang'], status['favorite_count'],
                status['retweet_count']
            ]
            # append the original author of the tweet
            new_entry.append(status['user']['screen_name'])
            try:
                is_sensitive = status['possibly_sensitive']
            except KeyError:
                is_sensitive = None
            new_entry.append(is_sensitive)
            # hashtags and mentions are saved comma-separated
            hashtags = ", ".join([hashtag_item['text']
                                  for hashtag_item in status['entities']['hashtags']])
            new_entry.append(hashtags)
            mentions = ", ".join([mention['screen_name']
                                  for mention in status['entities']['user_mentions']])
            new_entry.append(mentions)
            # get the location of the tweet if possible
            try:
                location = status['user']['location']
            except TypeError:
                location = ''
            new_entry.append(location)
            try:
                coordinates = [coord
                               for loc in status['place']['bounding_box']['coordinates']
                               for coord in loc]
            except TypeError:
                coordinates = None
            new_entry.append(coordinates)
            single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
            df = df.append(single_tweet_df, ignore_index=True)
    with open(file, 'a', encoding='utf-8') as csvFile:
        df.to_csv(csvFile, mode='a', columns=COLS, index=False, encoding="utf-8")
def extractor(self, data_source):
    # accept a list of files consisting of raw tweets in the form of JSON objects
    # self.data_source = data_source
    data_extracts = {'TweetID': [], 'ScreenName': [], 'RawTweets': [], 'CreatedAt': [],
                     'RetweetCount': [], 'FollowersCount': [], 'FriendsCount': [],
                     'StatusesCount': [], 'FavouritesCount': [], 'UserName': [],
                     'Location': [], 'AccountCreated': [], 'Language': [],
                     'Description': [], 'UserURL': [], 'VerifiedAccount': [],
                     'CleanTweets': [], 'UserID': [], 'TimeZone': [],
                     'TweetFavouriteCount': []}
    non_english_tweets = 0  # keep track of the non-English tweets
    with codecs.open(self.data_source, 'r') as f:  # data_source is read from extractor() function
        for line in f.readlines():
            non_English = 0
            try:
                line = json.loads(line)
                if line['lang'] in ['en', 'en-gb', 'en-GB', 'en-AU', 'en-IN', 'en_US']:
                    data_extracts['Language'].append(line['user']['lang'])
                    data_extracts['TweetID'].append(line['id_str'])
                    data_extracts['RawTweets'].append(line['text'])
                    data_extracts['CleanTweets'].append(p.clean(line['text']))
                    data_extracts['CreatedAt'].append(line['created_at'])
                    data_extracts['AccountCreated'].append(line['user']['created_at'])
                    data_extracts['ScreenName'].append(line['user']['screen_name'])
                    data_extracts['RetweetCount'].append(line['retweet_count'])
                    data_extracts['FollowersCount'].append(line['user']['followers_count'])
                    data_extracts['FriendsCount'].append(line['user']['friends_count'])
                    data_extracts['StatusesCount'].append(line['user']['statuses_count'])
                    data_extracts['FavouritesCount'].append(line['user']['favourites_count'])
                    data_extracts['UserName'].append(line['user']['name'])
                    data_extracts['Location'].append(line['user']['location'])
                    data_extracts['Description'].append(line['user']['description'])
                    data_extracts['UserURL'].append(line['user']['url'])
                    data_extracts['VerifiedAccount'].append(line['user']['verified'])
                    data_extracts['UserID'].append(line['user']['id'])
                    data_extracts['TimeZone'].append(line['user']['time_zone'])
                    data_extracts['TweetFavouriteCount'].append(line['favorite_count'])
                    # extracts['Coordinates'].append(line['coordinates'])
                else:
                    non_english_tweets += 1
            except:
                continue
    df0 = pd.DataFrame(data_extracts)  # convert data extracts to a pandas DataFrame
    df0['CreatedAt'] = pd.to_datetime(data_extracts['CreatedAt'], errors='coerce')  # convert to datetime
    df0['AccountCreated'] = pd.to_datetime(data_extracts['AccountCreated'], errors='coerce')
    df0 = df0.dropna(subset=['AccountCreated', 'CreatedAt'])  # drop NaT rows in the datetime columns
    AccountAge = []  # compute the age of each account
    date_format = "%Y-%m-%d %H:%M:%S"
    for dr, dc in zip(df0.CreatedAt, df0.AccountCreated):
        # try:
        dr = str(dr)
        dc = str(dc)
        d1 = datetime.strptime(dr, date_format)
        d2 = datetime.strptime(dc, date_format)
        dif = d1 - d2
        AccountAge.append(dif.days)
        # except:
        #     continue
    df0['AccountAge'] = AccountAge
    # add/define additional features ...
    df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[0] == 'RT')
    df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x)))  # modified
    df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
    df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
    df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
    df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
    df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
    df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
    df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
    df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
    df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
    df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
    df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
    df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
    df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
    # Remove all RTs, set UserID as the index and save the relevant files:
    df0 = df0[df0.Retweets.values == False]  # remove retweets
    df0 = df0.set_index('UserID')
    df0 = df0[~df0.index.duplicated()]  # remove duplicate tweets per user
    # df0.to_csv(data_source[:15] + 'all_extracts.csv')  # save all extracts as csv
    df0.to_csv(data_source[:5] + 'all_extracts.csv')  # save all extracts as csv
    with open(data_source[:5] + 'non_English.txt', 'w') as d:
        # save the count of non-English tweets
        d.write('{}'.format(non_english_tweets))
    return df0
def clean_tweet(text):
    # Remove emojis, hashtags and URLs with p.clean(), then strip remaining punctuation
    clean_text = re.sub(r'[^\w\s]', '', p.clean(text))
    return clean_text
print()
tweet['orig'] = i.encode('ascii')
tweet['orig'] = tweet['orig'].decode('ascii')
print("Original Tweet : " + tweet['orig'])
tweet['orig'] = re.sub(r'^(b)', '', tweet['orig'])
print("Original Tweet after removing b: " + tweet['orig'])
tweet['orig'] = re.sub(r'rt\b', '', tweet['orig'])
print("Original Tweet after removing rt: " + tweet['orig'])
tweet['orig'] = re.sub(r'RT\b', '', tweet['orig'])
print("Original Tweet after removing RT: " + tweet['orig'])
# cleaning with tweet-preprocessor, one option at a time
p.set_options(p.OPT.MENTION)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing mentions: " + tweet['orig'])
p.set_options(p.OPT.URL)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing url: " + tweet['orig'])
p.set_options(p.OPT.HASHTAG)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing hashtags: " + tweet['orig'])
p.set_options(p.OPT.RESERVED)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing reserved: " + tweet['orig'])
p.set_options(p.OPT.EMOJI)
def preprocess(tweet):
    # using http://grammar.about.com/od/words/a/EnglishContractions.html as the reference list of contractions
    cont = open("Contractions in English")
    line = cont.readline()
    contractions = {}
    while line:
        line = line.strip()
        line = re.split(r'\s+', line)
        contraction = line[0]
        expand = ""
        for i in range(1, len(line)):
            expand += line[i] + " "
        contractions[contraction] = expand[:-1]
        line = cont.readline()
    # lowercase the tweet and remove all links and emoji in it
    input = tweet.lower()
    input = input.strip()
    p.set_options(p.OPT.URL, p.OPT.EMOJI)
    clean = p.clean(input)
    clean = clean.replace("#", "")
    # convert the cleaned input tweet from unicode to a Python string
    try:
        clean = str(clean)
    except:
        clean = clean
    input = re.split(r'\s+', clean)
    output = []
    # tokenize the tweet
    for i in range(0, len(input)):
        word = input[i]
        # remove all other symbols/characters that are not mentioned
        word = re.sub("[^a-z0-9-.'/,]", '', word).strip()
        if word in contractions:
            match = contractions[word]
            output.extend(re.split(r'\s+', match))
        elif len(word) > 2 and (word[-2:] == "'s" or word[-2:] == "s'"):
            # handle possessives
            output.append(word[:-2])
            output.append("'s")
        elif isDate(word):
            output.append(word.strip())
        elif re.match(r'\w*,\w*', word):
            # tokenization of ','
            if re.match(r'\d+,\d+', word):
                # case of a number
                output.append(word)
            else:
                word = re.sub(',', '', word).strip()
                if word != "":
                    output.append(word)
        elif re.match(r'\w*\.\w*', word):
            # tokenization of '.'
            if word == ".":
                continue
            elif re.match(r'\w{2,}\.', word) and word != "mr." and word != "dr." and word != "st." \
                    and word != "oz." and word != "ms." and word != "jr." and word != "sr." \
                    and word != "mt." and word != "no." and word != "sq.":
                # some popular two-character abbreviations from
                # http://www.englishleap.com/other-resources/abbreviations
                output.append(word[:-1])
            else:
                # treat everything else, and one-character words ending with '.', as abbreviations/acronyms
                output.append(word.strip())
        else:
            word = re.sub("[^a-z0-9-.]", ' ', word).strip()
            if word == '':
                continue
            output.append(word)
    # print(output)
    return output
def test_clean(self):
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    cleaned_tweet = p.clean(tweet)
    self.assertEqual(cleaned_tweet, 'Hello there! was awesome .')
import preprocessor as p

text = '#MotoG5sPlusLaunch at 12:15 PM today!! Gear up to find your focus & unleash your creativity. http://bit.ly/2ge1cSm!'
a = p.clean(text)
print(a)
parsed_tweet = p.parse(text)
print(parsed_tweet.hashtags)
print(parsed_tweet.urls)
writer.writerow(header)
for dirs in main_dir:
    print(10 * "- - ", dirs, 10 * "- - ")
    dir = os.path.join(path, dirs)
    sub_dir = os.listdir(dir)
    for source_id in sub_dir:
        source_tweet = os.path.join(dir, source_id, "source-tweet", source_id + ".json")
        with open(source_tweet) as json_file:
            data = json.load(json_file)
            row = [
                "source",
                str(data['id']),
                p.clean(str(data['text'])),
                data['favorite_count'],
                data['retweet_count']
            ]
            writer.writerow(row)
        replies = os.path.join(dir, source_id, "replies")
        replies_dir = os.listdir(replies)
        for reply in replies_dir:
            tweet = os.path.join(replies, reply)
            reply_id = str(reply)[:-5]
            with open(tweet) as json_file:
                data = json.load(json_file)
                row = [
                    "reply",
def clean_tweet(tweet: str):
    preprocess = p.clean(tweet)
    preprocess = re.sub(r'[^\w\s]', '', preprocess)
    return preprocess
# Section 13.12 snippets
import preprocessor as p

p.set_options(p.OPT.URL, p.OPT.RESERVED)
tweet_text = 'RT A sample retweet with a URL https://nasa.gov'
p.clean(tweet_text)

##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and
# Pearson Education, Inc. All Rights Reserved.
#
# DISCLAIMER: The authors and publisher of this book have used their
# best efforts in preparing the book. These efforts include the
# development, research, and testing of the theories and programs
# to determine their effectiveness. The authors and publisher make
# no warranty of any kind, expressed or implied, with regard to these
# programs or to the documentation contained in these books. The authors
# and publisher shall not be liable in any event for incidental or
# consequential damages in connection with, or arising out of, the
# furnishing, performance, or use of these programs.
##########################################################################
# convert csv to DataFrame
df = js.make_csv("classified_data/all_json.json")
df = df[:500]
print(df.shape)
# print(df.columns)

# do preprocessing
# df = pre.xpreprocessing(df)
# df = post_processing(df)
list_pre = []
for i, row in df.iterrows():
    text = removepunc(row["full_text"])
    text = text.lstrip("RT")
    text = remove_stopwords(text)
    text = remove_numbers(text)
    list_pre.append(p.clean(text))
df["preprocessed_text"] = list_pre
tweets = list(df["preprocessed_text"])
'''
ids=list(df["id_str"])
#print df["preprocessed_text"]
tweets1=list(df["full_text"])
import csv

ACCESS_TOKEN = "2668727876-Yrz4VAyuedncEMFsFRQhy5G8b6ZKbcB9x2G58BU"
ACCESS_TOKEN_SECRET = "LEXRPAoFSKE7oBaqrrZRUBnIbgdoWbZhS5vG2zM2s7Y6j"
CONSUMER_KEY = "l79fswnkaCLeUjXeZzPir9iQU"
CONSUMER_SECRET = "6s1h36BhY9Ypdu7pxDWWSyT2u6mYpex8EUXwKJaewDAtxhsGVq"

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY)

with open('April1.tsv', 'r') as fin, open('April1_out.tsv', 'w') as fout:
    reader = csv.reader(fin, dialect='excel-tab')
    writer = csv.writer(fout, dialect='excel-tab')
    for row in reader:
        # delete indices in reverse order to avoid shifting earlier indices
        del row[1:]
        writer.writerow(row)

# t hydrate March1_out.tsv > March1.jsonl
with open('April1.csv', mode='w', encoding="utf-8") as corona_file:
    fieldnames = ['date', 'text', 'truncated']
    writer = csv.DictWriter(corona_file, fieldnames=fieldnames)
    writer.writeheader()
    for tweet in t.hydrate(open('April1_out.tsv')):
        # keep the cleaned text so it is what gets written out below
        tweet["full_text"] = p.clean(tweet["full_text"])
        writer.writerow({
            'date': tweet["created_at"],
            'text': tweet["full_text"],
            'truncated': tweet["truncated"]
        })
filename = 'full_text_unprocessed'  # the pickle file's name is full_text_unprocessed
outfile = open(filename, 'wb')
pickle.dump(full_text_list, outfile)
outfile.close()

pkl_lst = open('full_text_unprocessed', 'rb')
full_text_list = pickle.load(pkl_lst)

p.set_options(p.OPT.URL, p.OPT.EMOJI)
full_text_list_processed = []
for x in range(len(full_text_list)):
    full_text = full_text_list[x]
    clean = re.compile('<.*?>')
    full_text_processed = re.sub(clean, '', full_text)  # strip HTML tags
    # run p.clean on the HTML-stripped text so the previous step is not discarded
    full_text_processed = p.clean(full_text_processed)
    full_text_processed = re.sub(r'[,\.!?]', '', full_text_processed)
    full_text_processed = re.sub(r'[^a-zA-Z0-9\s]', ' ', full_text_processed)
    full_text_processed = full_text_processed.lower()
    full_text_processed = re.sub("#", "", full_text_processed)
    full_text_list_processed.append(full_text_processed)

# Convert the titles to lowercase
# full_text_list_processed = full_text_list_processed.apply(lambda x: x.lower())
# Print out the first rows of papers
print(full_text_list_processed)

filename = 'full_text_processed'  # the pickle file's name is full_text_processed
outfile = open(filename, 'wb')
pickle.dump(full_text_list_processed, outfile)
outfile.close()

punc_word = set(punctuation)
def fields_selection_cleaning_and_sentiment(input_file, output_file):
    print('Pre-processing the tweets! Please wait...')
    writer = csv.writer(open(output_file, 'wb'))
    writer.writerow(['Created_At', 'Tweet_Id', 'Text', 'GeoLocation', 'Coordinates',
                     'User_Id', 'User_Name', 'User_Location', 'Language', 'Time_Zone',
                     'Country', 'Friends_Count', 'Followers_Count', 'Sentiment_Polarity',
                     'Label', 'Location_Id'])
    with open(input_file) as f:
        for line in f:
            data = json.loads(line)
            if data['lang'] == 'en' \
                    and data['user'] is not None \
                    and data['user']['location'] != '':
                if data['place'] is not None:
                    country = data['place']['country']
                else:
                    country = ''
                try:
                    cleaned_tweet = preprocessor.clean(data['text'])
                    location = clean_location(data['user']['location'])[0]
                    location_id = clean_location(data['user']['location'])[1]
                    sentiment = TextBlob(cleaned_tweet).sentiment.polarity
                    if sentiment < 0:
                        label = 0
                    elif sentiment == 0:
                        label = 1
                    else:
                        label = 2
                    if location != 'NA':
                        writer.writerow([data['created_at'], data['id'], cleaned_tweet,
                                         data['geo'], data['coordinates'],
                                         data['user']['id'], data['user']['name'],
                                         location, data['lang'],
                                         data['user']['time_zone'], country,
                                         data['user']['friends_count'],
                                         data['user']['followers_count'],
                                         sentiment, label, location_id])
                except Exception:
                    pass
    print('Tweets pre-processing completed.\n')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
punctuation = [',', '.', '(', ')', '//', ':', ';', '<', '>', '?', '!', '@']

for filename in os.listdir("tweets"):
    with open('tweets/' + filename) as csv_file:
        with open('processedTweets/' + filename, mode='w') as inFile:
            fw = csv.writer(inFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_reader = csv.reader(csv_file, delimiter=';')
            line_count = 0
            for row in csv_reader:
                try:
                    tweet = p.clean(row[4])
                    word_tokens = word_tokenize(tweet)
                    filtered_sentence = [w.strip() for w in word_tokens if w.strip() not in stop_words]
                    filtered_sentence = [w for w in filtered_sentence if "/" not in w]
                    filtered_sentence = [w for w in filtered_sentence if "\\" not in w]
                    filtered_sentence = [w for w in filtered_sentence if w not in punctuation]
                    ans = []
                    for tk in word_tokens:
                        if "http" in tk or "https" in tk:
                            break
                        ans.append(tk)
                    fw.writerow(ans)
                    print("Tweet processed...")
                except:
                    continue
import sys
import csv
import preprocessor

reader = csv.reader(sys.stdin)
next(reader)
for orig_tweet, eng_tweet in reader:
    print(preprocessor.clean(orig_tweet))
api = tweepy.API(auth, wait_on_rate_limit=True)


def perc_response(a):
    # percentage of the 57 collected tweets that fall into a given class
    p_a = 100 * (len(a) / 57)
    return round(p_a, 2)


search_input = input("Enter your first keyword/hash you want to analyze:")
search_input1 = input("Enter your second keyword/hash you want to analyze:")

all_tweets = []
for tweet in tweepy.Cursor(api.search, q=search_input, tweet_mode='extended',
                           lang="en", result_type='recent').items(57):
    all_tweets.append(tweet.full_text)

tweets_clean = []
for tweet in all_tweets:
    tweet = p.clean(tweet)
    tweet = ' '.join(re.sub(':', ' ', tweet).split())
    tweets_clean.append(tweet)

positive_l = []
negative_l = []
neutral_l = []
for tweet in tweets_clean:
    if analyser.polarity_scores(tweet).get('compound') >= 0.05:
        positive_l.append(tweet)
    elif analyser.polarity_scores(tweet).get('compound') <= -0.05:
        negative_l.append(tweet)
    else:
        neutral_l.append(tweet)
def test_clean(self):
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
    cleaned_tweet = p.clean(tweet)
    self.assertEqual(cleaned_tweet, "Hello there! was awesome .")
def preprocess_tweet(text):
    # Strip any HTML markup with BeautifulSoup, then run tweet-preprocessor's clean()
    return tweet_prep.clean(BeautifulSoup(text, 'lxml').text)
def preprocess_tweet(tweet):
    return p.clean(tweet)
def clean_tweets(tweet):
    # Remove URLs, mentions and emojis, then strip stray colons
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)
    clean_tweet = p.clean(tweet)
    clean_tweet = re.sub(r':', '', clean_tweet)
    return clean_tweet
def readTweet(hermes, intentMessage):
    drumpf = ts.query.query_tweets_from_user(intentMessage.slots.User[0].slot_value.value.value, 1)
    tweet = drumpf[0].text
    name = drumpf[0].fullname
    return name + ' said ' + p.clean(tweet.encode('utf-8'))
def tweet_data():
    try:
        # Open/create a file to append data to.
        # If the file exists, read the existing data from the CSV file.
        file_name = "tourism_" + datetime.now().strftime("%d-%b-%Y") + "_data.csv"
        COLS = [
            'created_at', 'id', 'send_by', 'tweet_url', 'original_text',
            'trans', 'process', 'priority', 'type'
        ]
        if os.path.exists(file_name):
            df = pd.read_csv(file_name, header=0)
            pre_id = max(df["id"])
            print(pre_id)
        else:
            pre_id = 0
            df = pd.DataFrame(columns=COLS)
            print(pre_id)
        hndlr_lst = twitter_credential.handler_list
        # new_entry = []
        for name in hndlr_lst:
            for tweet in tweepy.Cursor(
                    api.search,
                    q=name,
                    count=100,
                    # lang="en",
                    since=datetime.now().strftime("%Y-%m-%d"),
                    since_id=pre_id,
                    # max_id=pre_id,
                    # until=datetime.now().strftime("%Y-%m-%d"),
            ).items():
                # tweet URL
                tweet_url = f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}"
                # Google translation
                translator = Translator()
                trans = translator.translate(tweet.text).text
                # cleaning the data
                process = p.clean(trans)
                process = re.sub(r':', '', process)
                process = re.sub(r'…', '', process)
                # VADER sentiment analysis
                sen_analyser = SentimentIntensityAnalyzer()
                polarity_scores = sen_analyser.polarity_scores(process)
                print(tweet.id)
                compnd = polarity_scores['compound']
                if compnd >= 0.05:
                    polarity = polarity_scores['pos']
                    polarity_type = "positive"
                elif compnd <= -0.05:
                    polarity = polarity_scores['neg']
                    polarity_type = "negative"
                else:
                    polarity = polarity_scores['neu']
                    polarity_type = "neutral"
                new_entry = [
                    tweet.created_at, tweet.id, tweet.user.screen_name,
                    tweet_url, tweet.text, trans, process, polarity,
                    polarity_type
                ]
                # print(new_entry)
                single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
                df_final = df.append(single_tweet_df, ignore_index=True)
                df = pd.DataFrame(data=df_final, columns=COLS)
        df.to_csv(file_name)
        # print("Got all the tweets.")
    except tweepy.TweepError as e:
        print(str(e))
        print("Something went wrong.")