def set_sentence_(sentence):
    # First pass: strip URLs and emojis, then expand hashtags.
    p.set_options(p.OPT.URL, p.OPT.EMOJI)
    sentence = p.clean(sentence)
    sentence = hashtag_power(sentence)
    # Second pass: strip any remaining hashtags, then punctuation and accents.
    p.set_options(p.OPT.HASHTAG)
    sentence = p.clean(sentence)
    sentence = punc(sentence)
    sentence = Enleve_Accents(sentence)
    return sentence
def get_data(uuid, server, port, title, start, end):
    # URL of the dataset
    url = "http://" + server + ":" + port + "/api/data/uuid/" + uuid + \
          "?endtime=" + str(int(end) * 1000) + \
          "&starttime=" + str(int(start) * 1000) + "&limit=-1"
    # Request the data in JSON format
    print(url)
    try:
        r = requests.get(url)
    except Exception as e:
        print(e)
        sys.exit(1)
    archiver_data = r.json()
    if len(archiver_data[0]['Readings']) == 0:
        sys.exit(1)
    # Convert to a dataframe
    df = pandas.DataFrame(archiver_data[0]['Readings'], columns=["Timestamp", title])
    df = df.set_index('Timestamp')
    df.index = pandas.to_datetime(df.index, unit='ms')
    df = df.tz_localize("UTC").tz_convert("Asia/Kolkata")
    df = clean(df, title)
    # Return a series with a DatetimeIndex for resampling and further processing
    return df
def action(self, tweets_list):
    corpus = []
    for t in tweets_list:
        tweet = t["text"]
        tweet = p.clean(tweet.encode("utf-8"))
        # tweet = set_sentence(tweet.encode("utf-8"))
        s = get_categorie_([tweet])
        if s != Nocat:
            corpus.append(tweet)
            t["cat"] = s
            print(tweet)
            print(t)
def get_twitter():
    import pandas as pd
    import preprocessor as p
    with open("2014_india_floods.csv") as fl:
        data = pd.read_csv(fl)
    tweet_id = data['tweet_id'].tolist()
    text = data['tweet_text'].tolist()
    text = [p.clean(t) for t in text]
    labels = data['choose_one_category'].tolist()
    return (text, labels)
def __init__(self, program, mode="MIPS"):
    super(Assembler, self).__init__()
    try:
        text = program.read()
    except AttributeError:
        text = program
    self.mode = mode.upper()
    self.registers = Registers(self.mode)
    lines = text.split("\n")
    lines = clean(lines, self.mode)
    instrs, data = split_sections(lines)
    self.memory = Memory()
    for d in data:
        self.memory.insert(d)
    instrs = preprocess(instrs, self.mode)
    self.labels = label_positions(instrs)
    self.instructions = [Instruction(instr) for instr in instrs]
def clean_tweet(self, tweet_text):
    """
    Take a raw tweet and return a cleaned list of tweet tokens.
    :param tweet_text: raw tweet text
    :return: list of token words
    """
    tweet = preprocessor.clean(tweet_text)
    tokens = [word[1:] if word.startswith('#') else word for word in tweet.split(' ')]
    tokens = self.replace_abbreviations(tokens)
    tokens = self.remove_stopwords_spelling_mistakes(tokens)
    tokens = gensim.utils.simple_preprocess(' '.join(tokens))
    return tokens
tweet_senti = []
tweet_dict = {}
ptweet = 0
ntweet = 0
ntrl = 0
flag = 0
tmp_list = []    # cleaned tweet strings (assumed: not initialised in the original snippet)
clean_list = []  # fully cleaned tweet strings (assumed: not initialised in the original snippet)
connection = MongoClient()
db = connection.indian_national_congress
posts = db.bjp
count = posts.find().count()
print(count)
# drop the SMILEY and EMOJI options below if they are to be kept in the final text
p.set_options(p.OPT.RESERVED, p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG,
              p.OPT.NUMBER, p.OPT.SMILEY, p.OPT.EMOJI)
for d in posts.find({}, {'text': 1, '_id': False}):
    tmp_item = str(d)
    tmp_list.append(p.clean(tmp_item))
for item in tmp_list:
    item = item.strip()
    item = item.replace('RT', '')
    item = item.replace('\\n', '')
    item = " ".join(re.findall("[a-zA-Z]+", item))
    tmp_var = re.sub(r'^\S*\s', '', item)
    clean_list.append(tmp_var)
df = pd.DataFrame(clean_list, columns=["text"])
df.to_csv('inc.csv', index=False)
def write_tweets(keyword, file):
    # If the file exists, read the existing data from the CSV file.
    if os.path.exists(file):
        df = pd.read_csv(file, header=0)
    else:
        df = pd.DataFrame(columns=COLS)
    # page attribute in tweepy.Cursor and iteration
    for page in tweepy.Cursor(api.search, q=keyword, count=200,
                              include_rts=False, since=start_date).pages(50):
        for status in page:
            new_entry = []
            status = status._json
            # check whether the tweet is in English, otherwise skip to the next tweet
            if status['lang'] != 'en':
                continue
            # on re-runs, the block below updates the retweet and favorite
            # counts that have changed since the last download
            if status['created_at'] in df['created_at'].values:
                i = df.loc[df['created_at'] == status['created_at']].index[0]
                if status['favorite_count'] != df.at[i, 'favorite_count'] or \
                   status['retweet_count'] != df.at[i, 'retweet_count']:
                    df.at[i, 'favorite_count'] = status['favorite_count']
                    df.at[i, 'retweet_count'] = status['retweet_count']
                continue
            # tweet-preprocessor called for basic preprocessing
            clean_text = p.clean(status['text'])
            # call the clean_tweets method for extra preprocessing
            filtered_tweet = clean_tweets(clean_text)
            # pass to TextBlob for sentiment calculations
            blob = TextBlob(filtered_tweet)
            Sentiment = blob.sentiment
            # separate polarity and subjectivity into two variables
            polarity = Sentiment.polarity
            subjectivity = Sentiment.subjectivity
            # append the new entry
            new_entry += [
                status['id'], status['created_at'], status['source'],
                status['text'], filtered_tweet, Sentiment, polarity,
                subjectivity, status['lang'], status['favorite_count'],
                status['retweet_count']
            ]
            # append the original author of the tweet
            new_entry.append(status['user']['screen_name'])
            try:
                is_sensitive = status['possibly_sensitive']
            except KeyError:
                is_sensitive = None
            new_entry.append(is_sensitive)
            # hashtags and mentions are saved comma-separated
            hashtags = ", ".join([hashtag_item['text']
                                  for hashtag_item in status['entities']['hashtags']])
            new_entry.append(hashtags)
            mentions = ", ".join([mention['screen_name']
                                  for mention in status['entities']['user_mentions']])
            new_entry.append(mentions)
            # get the location of the tweet if possible
            try:
                location = status['user']['location']
            except TypeError:
                location = ''
            new_entry.append(location)
            try:
                coordinates = [coord
                               for loc in status['place']['bounding_box']['coordinates']
                               for coord in loc]
            except TypeError:
                coordinates = None
            new_entry.append(coordinates)
            single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
            df = df.append(single_tweet_df, ignore_index=True)
    with open(file, 'a', encoding='utf-8') as csvFile:
        df.to_csv(csvFile, mode='a', columns=COLS, index=False, encoding="utf-8")
def extractor(self, data_source):
    # accept a list of files consisting of raw tweets in the form of JSON objects
    # self.data_source = data_source
    data_extracts = {'TweetID': [], 'ScreenName': [], 'RawTweets': [], 'CreatedAt': [],
                     'RetweetCount': [], 'FollowersCount': [], 'FriendsCount': [],
                     'StatusesCount': [], 'FavouritesCount': [], 'UserName': [],
                     'Location': [], 'AccountCreated': [], 'Language': [],
                     'Description': [], 'UserURL': [], 'VerifiedAccount': [],
                     'CleanTweets': [], 'UserID': [], 'TimeZone': [],
                     'TweetFavouriteCount': []}
    non_english_tweets = 0  # keep track of the non-English tweets
    with codecs.open(self.data_source, 'r') as f:  # data_source is read from extractor() function
        for line in f.readlines():
            non_English = 0
            try:
                line = json.loads(line)
                if line['lang'] in ['en', 'en-gb', 'en-GB', 'en-AU', 'en-IN', 'en_US']:
                    data_extracts['Language'].append(line['user']['lang'])
                    data_extracts['TweetID'].append(line['id_str'])
                    data_extracts['RawTweets'].append(line['text'])
                    data_extracts['CleanTweets'].append(p.clean(line['text']))
                    data_extracts['CreatedAt'].append(line['created_at'])
                    data_extracts['AccountCreated'].append(line['user']['created_at'])
                    data_extracts['ScreenName'].append(line['user']['screen_name'])
                    data_extracts['RetweetCount'].append(line['retweet_count'])
                    data_extracts['FollowersCount'].append(line['user']['followers_count'])
                    data_extracts['FriendsCount'].append(line['user']['friends_count'])
                    data_extracts['StatusesCount'].append(line['user']['statuses_count'])
                    data_extracts['FavouritesCount'].append(line['user']['favourites_count'])
                    data_extracts['UserName'].append(line['user']['name'])
                    data_extracts['Location'].append(line['user']['location'])
                    data_extracts['Description'].append(line['user']['description'])
                    data_extracts['UserURL'].append(line['user']['url'])
                    data_extracts['VerifiedAccount'].append(line['user']['verified'])
                    data_extracts['UserID'].append(line['user']['id'])
                    data_extracts['TimeZone'].append(line['user']['time_zone'])
                    data_extracts['TweetFavouriteCount'].append(line['favorite_count'])
                    # extracts['Coordinates'].append(line['coordinates'])
                else:
                    non_english_tweets += 1
            except:
                continue
    df0 = pd.DataFrame(data_extracts)  # convert data extracts to a pandas DataFrame
    df0['CreatedAt'] = pd.to_datetime(data_extracts['CreatedAt'], errors='coerce')  # convert to datetime
    df0['AccountCreated'] = pd.to_datetime(data_extracts['AccountCreated'], errors='coerce')
    df0 = df0.dropna(subset=['AccountCreated', 'CreatedAt'])  # drop NaT rows in the datetime columns
    AccountAge = []  # compute the age of each account
    date_format = "%Y-%m-%d %H:%M:%S"
    for dr, dc in zip(df0.CreatedAt, df0.AccountCreated):
        # try:
        dr = str(dr)
        dc = str(dc)
        d1 = datetime.strptime(dr, date_format)
        d2 = datetime.strptime(dc, date_format)
        dif = d1 - d2
        AccountAge.append(dif.days)
        # except:
        #     continue
    df0['AccountAge'] = AccountAge
    # add/define additional features ...
    df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[0] == 'RT')
    df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x)))  # modified
    df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
    df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
    df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
    df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
    df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
    df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
    df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
    df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
    df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
    df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
    df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
    df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
    df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
    # Remove all RTs, set UserID as the index and save the relevant files:
    df0 = df0[df0.Retweets.values == False]  # remove retweets
    df0 = df0.set_index('UserID')
    df0 = df0[~df0.index.duplicated()]  # remove duplicate tweets per user
    # df0.to_csv(data_source[:15] + 'all_extracts.csv')  # save all extracts as csv
    df0.to_csv(data_source[:5] + 'all_extracts.csv')  # save all extracts as csv
    with open(data_source[:5] + 'non_English.txt', 'w') as d:
        # save the count of non-English tweets
        d.write('{}'.format(non_english_tweets))
    return df0
def clean_tweet(text):
    # Remove emojis, hashtags and URLs with p.clean(), then strip remaining punctuation
    clean_text = re.sub(r'[^\w\s]', '', p.clean(text))
    return clean_text
print()
tweet['orig'] = i.encode('ascii')
tweet['orig'] = tweet['orig'].decode('ascii')
print("Original Tweet : " + tweet['orig'])
tweet['orig'] = re.sub(r'^(b)', '', tweet['orig'])
print("Original Tweet after removing b: " + tweet['orig'])
tweet['orig'] = re.sub(r'rt\b', '', tweet['orig'])
print("Original Tweet after removing rt: " + tweet['orig'])
tweet['orig'] = re.sub(r'RT\b', '', tweet['orig'])
print("Original Tweet after removing RT: " + tweet['orig'])
# cleaning with tweet-preprocessor, one option at a time
p.set_options(p.OPT.MENTION)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing mentions: " + tweet['orig'])
p.set_options(p.OPT.URL)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing url: " + tweet['orig'])
p.set_options(p.OPT.HASHTAG)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing hashtags: " + tweet['orig'])
p.set_options(p.OPT.RESERVED)
tweet['orig'] = p.clean(tweet['orig'])
print("Original Tweet after removing reserved: " + tweet['orig'])
p.set_options(p.OPT.EMOJI)
def preprocess(tweet):
    # using http://grammar.about.com/od/words/a/EnglishContractions.html as the reference list of contractions
    cont = open("Contractions in English")
    line = cont.readline()
    contractions = {}
    while line:
        line = line.strip()
        line = re.split(r'\s+', line)
        contraction = line[0]
        expand = ""
        for i in range(1, len(line)):
            expand += line[i] + " "
        contractions[contraction] = expand[:-1]
        line = cont.readline()
    # lowercase the tweet and remove all links and emoji in it
    input = tweet.lower()
    input = input.strip()
    p.set_options(p.OPT.URL, p.OPT.EMOJI)
    clean = p.clean(input)
    clean = clean.replace("#", "")
    # convert the cleaned input tweet from unicode to a Python string
    try:
        clean = str(clean)
    except:
        clean = clean
    input = re.split(r'\s+', clean)
    output = []
    # tokenize the tweet
    for i in range(0, len(input)):
        word = input[i]
        # remove all other symbols/characters that are not mentioned
        word = re.sub("[^a-z0-9-.'/,]", '', word).strip()
        if word in contractions:
            match = contractions[word]
            output.extend(re.split(r'\s+', match))
        elif len(word) > 2 and (word[-2:] == "'s" or word[-2:] == "s'"):
            # handle possessives
            output.append(word[:-2])
            output.append("'s")
        elif isDate(word):
            output.append(word.strip())
        elif re.match(r'\w*,\w*', word):
            # tokenization of ','
            if re.match(r'\d+,\d+', word):
                # case of a number
                output.append(word)
            else:
                word = re.sub(',', '', word).strip()
                if word != "":
                    output.append(word)
        elif re.match(r'\w*\.\w*', word):
            # tokenization of '.'
            if word == ".":
                continue
            elif re.match(r'\w{2,}\.', word) and word != "mr." and word != "dr." and word != "st." \
                    and word != "oz." and word != "ms." and word != "jr." and word != "sr." \
                    and word != "mt." and word != "no." and word != "sq.":
                # some popular two-character abbreviations from
                # http://www.englishleap.com/other-resources/abbreviations
                output.append(word[:-1])
            else:
                # treat everything else, and one-character words ending with '.', as abbreviations/acronyms
                output.append(word.strip())
        else:
            word = re.sub("[^a-z0-9-.]", ' ', word).strip()
            if word == '':
                continue
            output.append(word)
    # print(output)
    return output
def test_clean(self):
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    cleaned_tweet = p.clean(tweet)
    self.assertEqual(cleaned_tweet, 'Hello there! was awesome .')
import preprocessor as p

text = '#MotoG5sPlusLaunch at 12:15 PM today!! Gear up to find your focus & unleash your creativity. http://bit.ly/2ge1cSm!'
a = p.clean(text)
print(a)
parsed_tweet = p.parse(text)
print(parsed_tweet.hashtags)
print(parsed_tweet.urls)
writer.writerow(header)
for dirs in main_dir:
    print(10 * "- - ", dirs, 10 * "- - ")
    dir = os.path.join(path, dirs)
    sub_dir = os.listdir(dir)
    for source_id in sub_dir:
        source_tweet = os.path.join(dir, source_id, "source-tweet", source_id + ".json")
        with open(source_tweet) as json_file:
            data = json.load(json_file)
            row = [
                "source",
                str(data['id']),
                p.clean(str(data['text'])),
                data['favorite_count'],
                data['retweet_count']
            ]
            writer.writerow(row)
        replies = os.path.join(dir, source_id, "replies")
        replies_dir = os.listdir(replies)
        for reply in replies_dir:
            tweet = os.path.join(replies, reply)
            reply_id = str(reply)[:-5]
            with open(tweet) as json_file:
                data = json.load(json_file)
                row = [
                    "reply",
def clean_tweet(tweet: str):
    preprocess = p.clean(tweet)
    preprocess = re.sub(r'[^\w\s]', '', preprocess)
    return preprocess
# Section 13.12 snippets
import preprocessor as p

p.set_options(p.OPT.URL, p.OPT.RESERVED)
tweet_text = 'RT A sample retweet with a URL https://nasa.gov'
p.clean(tweet_text)

##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and
# Pearson Education, Inc. All Rights Reserved.
#
# DISCLAIMER: The authors and publisher of this book have used their
# best efforts in preparing the book. These efforts include the
# development, research, and testing of the theories and programs
# to determine their effectiveness. The authors and publisher make
# no warranty of any kind, expressed or implied, with regard to these
# programs or to the documentation contained in these books. The authors
# and publisher shall not be liable in any event for incidental or
# consequential damages in connection with, or arising out of, the
# furnishing, performance, or use of these programs.
##########################################################################
# convert csv to DataFrame
df = js.make_csv("classified_data/all_json.json")
df = df[:500]
print(df.shape)
# print(df.columns)

# do preprocessing
# df = pre.xpreprocessing(df)
# df = post_processing(df)
list_pre = []
for i, row in df.iterrows():
    text = removepunc(row["full_text"])
    text = text.lstrip("RT")
    text = remove_stopwords(text)
    text = remove_numbers(text)
    list_pre.append(p.clean(text))
df["preprocessed_text"] = list_pre
tweets = list(df["preprocessed_text"])
'''
ids=list(df["id_str"])
#print df["preprocessed_text"]
tweets1=list(df["full_text"])
import csv

ACCESS_TOKEN = "2668727876-Yrz4VAyuedncEMFsFRQhy5G8b6ZKbcB9x2G58BU"
ACCESS_TOKEN_SECRET = "LEXRPAoFSKE7oBaqrrZRUBnIbgdoWbZhS5vG2zM2s7Y6j"
CONSUMER_KEY = "l79fswnkaCLeUjXeZzPir9iQU"
CONSUMER_SECRET = "6s1h36BhY9Ypdu7pxDWWSyT2u6mYpex8EUXwKJaewDAtxhsGVq"

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY)

with open('April1.tsv', 'r') as fin, open('April1_out.tsv', 'w') as fout:
    reader = csv.reader(fin, dialect='excel-tab')
    writer = csv.writer(fout, dialect='excel-tab')
    for row in reader:
        # delete indices in reverse order to avoid shifting earlier indices
        del row[1:]
        writer.writerow(row)

# t hydrate March1_out.tsv > March1.jsonl
with open('April1.csv', mode='w', encoding="utf-8") as corona_file:
    fieldnames = ['date', 'text', 'truncated']
    writer = csv.DictWriter(corona_file, fieldnames=fieldnames)
    writer.writeheader()
    for tweet in t.hydrate(open('April1_out.tsv')):
        # keep the cleaned text so it is what gets written out below
        tweet["full_text"] = p.clean(tweet["full_text"])
        writer.writerow({
            'date': tweet["created_at"],
            'text': tweet["full_text"],
            'truncated': tweet["truncated"]
        })
filename = 'full_text_unprocessed'  # the pickle file's name is full_text_unprocessed
outfile = open(filename, 'wb')
pickle.dump(full_text_list, outfile)
outfile.close()

pkl_lst = open('full_text_unprocessed', 'rb')
full_text_list = pickle.load(pkl_lst)

p.set_options(p.OPT.URL, p.OPT.EMOJI)
full_text_list_processed = []
for x in range(len(full_text_list)):
    full_text = full_text_list[x]
    clean = re.compile('<.*?>')
    full_text_processed = re.sub(clean, '', full_text)  # strip HTML tags
    # run p.clean on the HTML-stripped text so the previous step is not discarded
    full_text_processed = p.clean(full_text_processed)
    full_text_processed = re.sub(r'[,\.!?]', '', full_text_processed)
    full_text_processed = re.sub(r'[^a-zA-Z0-9\s]', ' ', full_text_processed)
    full_text_processed = full_text_processed.lower()
    full_text_processed = re.sub("#", "", full_text_processed)
    full_text_list_processed.append(full_text_processed)

# Convert the titles to lowercase
# full_text_list_processed = full_text_list_processed.apply(lambda x: x.lower())
# Print out the first rows of papers
print(full_text_list_processed)

filename = 'full_text_processed'  # the pickle file's name is full_text_processed
outfile = open(filename, 'wb')
pickle.dump(full_text_list_processed, outfile)
outfile.close()

punc_word = set(punctuation)
def fields_selection_cleaning_and_sentiment(input_file, output_file):
    print('Pre-processing the tweets! Please wait...')
    writer = csv.writer(open(output_file, 'wb'))
    writer.writerow(['Created_At', 'Tweet_Id', 'Text', 'GeoLocation', 'Coordinates',
                     'User_Id', 'User_Name', 'User_Location', 'Language', 'Time_Zone',
                     'Country', 'Friends_Count', 'Followers_Count', 'Sentiment_Polarity',
                     'Label', 'Location_Id'])
    with open(input_file) as f:
        for line in f:
            data = json.loads(line)
            if data['lang'] == 'en' \
                    and data['user'] is not None \
                    and data['user']['location'] != '':
                if data['place'] is not None:
                    country = data['place']['country']
                else:
                    country = ''
                try:
                    cleaned_tweet = preprocessor.clean(data['text'])
                    location = clean_location(data['user']['location'])[0]
                    location_id = clean_location(data['user']['location'])[1]
                    sentiment = TextBlob(cleaned_tweet).sentiment.polarity
                    if sentiment < 0:
                        label = 0
                    elif sentiment == 0:
                        label = 1
                    else:
                        label = 2
                    if location != 'NA':
                        writer.writerow([data['created_at'], data['id'], cleaned_tweet,
                                         data['geo'], data['coordinates'],
                                         data['user']['id'], data['user']['name'],
                                         location, data['lang'],
                                         data['user']['time_zone'], country,
                                         data['user']['friends_count'],
                                         data['user']['followers_count'],
                                         sentiment, label, location_id])
                except Exception:
                    pass
    print('Tweets pre-processing completed.\n')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
punctuation = [',', '.', '(', ')', '//', ':', ';', '<', '>', '?', '!', '@']

for filename in os.listdir("tweets"):
    with open('tweets/' + filename) as csv_file:
        with open('processedTweets/' + filename, mode='w') as inFile:
            fw = csv.writer(inFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_reader = csv.reader(csv_file, delimiter=';')
            line_count = 0
            for row in csv_reader:
                try:
                    tweet = p.clean(row[4])
                    word_tokens = word_tokenize(tweet)
                    filtered_sentence = [w.strip() for w in word_tokens if w.strip() not in stop_words]
                    filtered_sentence = [w for w in filtered_sentence if "/" not in w]
                    filtered_sentence = [w for w in filtered_sentence if "\\" not in w]
                    filtered_sentence = [w for w in filtered_sentence if w not in punctuation]
                    ans = []
                    for tk in word_tokens:
                        if "http" in tk or "https" in tk:
                            break
                        ans.append(tk)
                    fw.writerow(ans)
                    print("Tweet processed...")
                except:
                    continue
import sys
import csv
import preprocessor

reader = csv.reader(sys.stdin)
next(reader)
for orig_tweet, eng_tweet in reader:
    print(preprocessor.clean(orig_tweet))
api = tweepy.API(auth, wait_on_rate_limit=True)


def perc_response(a):
    # percentage of the 57 collected tweets that fall into a given class
    p_a = 100 * (len(a) / 57)
    return round(p_a, 2)


search_input = input("Enter your first keyword/hash you want to analyze:")
search_input1 = input("Enter your second keyword/hash you want to analyze:")

all_tweets = []
for tweet in tweepy.Cursor(api.search, q=search_input, tweet_mode='extended',
                           lang="en", result_type='recent').items(57):
    all_tweets.append(tweet.full_text)

tweets_clean = []
for tweet in all_tweets:
    tweet = p.clean(tweet)
    tweet = ' '.join(re.sub(':', ' ', tweet).split())
    tweets_clean.append(tweet)

positive_l = []
negative_l = []
neutral_l = []
for tweet in tweets_clean:
    if analyser.polarity_scores(tweet).get('compound') >= 0.05:
        positive_l.append(tweet)
    elif analyser.polarity_scores(tweet).get('compound') <= -0.05:
        negative_l.append(tweet)
    else:
        neutral_l.append(tweet)
def test_clean(self):
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
    cleaned_tweet = p.clean(tweet)
    self.assertEqual(cleaned_tweet, "Hello there! was awesome .")
def preprocess_tweet(text):
    # Strip any HTML markup with BeautifulSoup, then run tweet-preprocessor's clean()
    return tweet_prep.clean(BeautifulSoup(text, 'lxml').text)
def preprocess_tweet(tweet):
    return p.clean(tweet)
def clean_tweets(tweet):
    # Remove URLs, mentions and emojis, then strip stray colons
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)
    clean_tweet = p.clean(tweet)
    clean_tweet = re.sub(r':', '', clean_tweet)
    return clean_tweet
def readTweet(hermes, intentMessage):
    drumpf = ts.query.query_tweets_from_user(intentMessage.slots.User[0].slot_value.value.value, 1)
    tweet = drumpf[0].text
    name = drumpf[0].fullname
    return name + ' said ' + p.clean(tweet.encode('utf-8'))
def tweet_data():
    try:
        # Open/create a file to append data to.
        # If the file exists, read the existing data from the CSV file.
        file_name = "tourism_" + datetime.now().strftime("%d-%b-%Y") + "_data.csv"
        COLS = [
            'created_at', 'id', 'send_by', 'tweet_url', 'original_text',
            'trans', 'process', 'priority', 'type'
        ]
        if os.path.exists(file_name):
            df = pd.read_csv(file_name, header=0)
            pre_id = max(df["id"])
            print(pre_id)
        else:
            pre_id = 0
            df = pd.DataFrame(columns=COLS)
            print(pre_id)
        hndlr_lst = twitter_credential.handler_list
        # new_entry = []
        for name in hndlr_lst:
            for tweet in tweepy.Cursor(
                    api.search,
                    q=name,
                    count=100,
                    # lang="en",
                    since=datetime.now().strftime("%Y-%m-%d"),
                    since_id=pre_id,
                    # max_id=pre_id,
                    # until=datetime.now().strftime("%Y-%m-%d"),
            ).items():
                # tweet URL
                tweet_url = f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}"
                # Google translation
                translator = Translator()
                trans = translator.translate(tweet.text).text
                # cleaning the data
                process = p.clean(trans)
                process = re.sub(r':', '', process)
                process = re.sub(r'…', '', process)
                # VADER sentiment analysis
                sen_analyser = SentimentIntensityAnalyzer()
                polarity_scores = sen_analyser.polarity_scores(process)
                print(tweet.id)
                compnd = polarity_scores['compound']
                if compnd >= 0.05:
                    polarity = polarity_scores['pos']
                    polarity_type = "positive"
                elif compnd <= -0.05:
                    polarity = polarity_scores['neg']
                    polarity_type = "negative"
                else:
                    polarity = polarity_scores['neu']
                    polarity_type = "neutral"
                new_entry = [
                    tweet.created_at, tweet.id, tweet.user.screen_name,
                    tweet_url, tweet.text, trans, process, polarity,
                    polarity_type
                ]
                # print(new_entry)
                single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
                df_final = df.append(single_tweet_df, ignore_index=True)
                df = pd.DataFrame(data=df_final, columns=COLS)
        df.to_csv(file_name)
        # print("Got all the tweets.")
    except tweepy.TweepError as e:
        print(str(e))
        print("Something went wrong.")