def clean(self, tweets):
    """Clean the raw text of every tweet in *tweets*.

    tweets is a dict mapping a key to a list of tweet dicts; each tweet
    dict has a 'text' field. For every tweet this adds a 'text_clean'
    field: UTF-8-encodable bytes only, punctuation replaced by spaces,
    normalized, stop words removed, lower-cased. Returns the same
    (mutated) *tweets* dict.
    """
    # Hoist loop-invariant setup: the helper objects and the translate
    # table do not depend on the tweet being processed.
    norm = Normalizer()
    stp = StpRemoval()
    # Python 2 byte-string table mapping every punctuation char to a space.
    punct_table = string.maketrans(string.punctuation,
                                   ' ' * len(string.punctuation))
    for tw in tweets:
        for t in tweets[tw]:
            # Drop characters that cannot be encoded as UTF-8.
            text = t['text'].encode('utf-8', errors='ignore')
            # Replace all punctuation with spaces in one C-level pass.
            text = text.translate(punct_table)
            text = norm.normalize(text)
            # BUG FIX: the original ran removeStp on the *un-normalized*
            # text, discarding the normalize() result entirely.
            text = stp.removeStp(text)
            # t aliases the dict inside tweets[tw], so mutating it in
            # place updates tweets directly (no index counter needed).
            t['text_clean'] = text.lower()
    return tweets
# Author : Alfan F. Wicaksono
# IR Lab, FASILKOM, UI

# Script for pre-processing twitter corpus

from normalizer import Normalizer
from stpremoval import StpRemoval

##################### you can modify this part ######################

corpusFile = "debatcapres_2014_sesi1.txt"
preprocessedFile = "debatcapres_2014_sesi1_processed.txt"

#####################################################################

nm = Normalizer()
sw = StpRemoval()

# FIX: use context managers so both files are closed even if
# normalization raises part-way through (the original leaked the
# handles on any error path).
with open(corpusFile, "r") as fin, open(preprocessedFile, "w") as fout:
    for line in fin:
        line = line.strip()        # remove carriage return
        line = nm.normalize(line)  # normalization
        line = sw.removeStp(line)  # remove stop word
        fout.write(line)           # put preprocessed tweet on the new file
        fout.write("\n")
tweet.append(line)

# store in dataframe
prab_data = DataFrame(tweet)
# rename the column
prab_data.columns = ['tweet']

# score the tweet
score = []
# create normalizer object
norm = Normalizer()
# create Stop word Removal object
st = StpRemoval()
# create sentiment analysis object
s = Sentianal()

# FIX: iterate the column directly instead of the non-idiomatic
# `for i in range(0, len(prab_data))` index loop. `line` is still
# rebound on every pass, so after the loop it holds the last
# processed tweet exactly as before.
for raw in prab_data['tweet']:
    # normalize
    line = norm.normalize(raw)
    # remove stopword
    line = st.removeStp(line)
    # score sentiment
    score.append(s.compute(line))
# join the dataframe
tweet.append(line)

# store in dataframe
prab_data = DataFrame(tweet)
# rename the column
prab_data.columns = ['tweet']

# score the tweet
score = []
# create normalizer object
norm = Normalizer()
# create Stop word Removal object
st = StpRemoval()
# create sentiment analysis object
s = Sentianal()

# FIX: replace the positional `for i in range(0, len(prab_data))`
# loop with direct iteration over the tweet column; behavior is
# unchanged (including `line` holding the last cleaned tweet after
# the loop).
for raw_tweet in prab_data['tweet']:
    # normalize
    line = norm.normalize(raw_tweet)
    # remove stopword
    line = st.removeStp(line)
    # score sentiment
    score.append(s.compute(line))
# join the dataframe