file for testing things before adding them to pipeline """ import time from tweepy import Stream from tweepy import OAuthHandler from tweepy.streaming import StreamListener import io import os import json from keys import consumer_secret,consumer_key,access_token_secret,access_token import accessStream import cleanText #from main import keyword_list df = cleanText.csvToPandasDF("tweets_out.csv") #print df.head(1) #print df.shape df.insert(0, 'uID', range(0, df.shape[0])) #print df.head(4) #print df.shape #keyword_list = ['bernie, sanders, hillary, clinton'] keyword_list = ['bernie, sanders, hillary, clinton'] keywords = keyword_list[0].replace(',', '').split() #keywords = keyword_list[0].replace(',', '').split() #print keywords
# --- Capture: run the Twitter stream through the project listener ----------
print("starting capturing stream")
# A previous version of this call (kept by the author as a comment) passed
# only (start_time, time_limit, inputFile) to the Listener:
#twitterStream = Stream(auth, accessStream.Listener(start_time, time_limit,inputFile))
# NOTE(review): the original author flagged this stream setup as not working.
# The cause is not visible here -- presumably accessStream.Listener has to
# accept (tweet_limit, start_time, time_limit, inputFile) in this order;
# verify against accessStream.
twitterStream = Stream(
    auth,
    accessStream.Listener(tweet_limit, start_time, time_limit, inputFile))
# Call the filter method to run the Stream Listener on the tracked keywords.
twitterStream.filter(track=keyword_list)
print("done capturing stream")

# --- Convert: stream output file -> CSV -> DataFrame ----------------------
print("cleaning tweets")
cleanText.jsonUTF8toCsv(inputFile, outputFile)
print("tweets cleaned to CSV. CSV created")
cleanDF = cleanText.csvToPandasDF(outputFile)
print("data frame created.")

# --- Identify: one subject per tweet text ------------------------------------
print("identifying subject of the tweet")
textList = cleanDF.loc[:, ['text']]
subject_array = []
clean_text = []
# Author's note: the split form of the keywords
# (keyword_list[0].replace(',', '').split()) "can only be used in one place
# or only first word comes through" -- presumably why the unsplit
# keyword_list is what gets passed to identifySubject here.
#
# Iterate the 'text' column directly instead of `xrange(len(...))` plus
# label lookups: `xrange` does not exist on Python 3, and `Series[i]` is a
# label lookup that only matches row position while the index is 0..n-1.
# `i` is still bound to the row position for any code that relies on it.
for i, tweet_text in enumerate(textList['text']):
    subject_array.append(cleanText.identifySubject(tweet_text, keyword_list))
file for testing things before adding them to pipeline """ import time from tweepy import Stream from tweepy import OAuthHandler from tweepy.streaming import StreamListener import io import os import json from keys import consumer_secret, consumer_key, access_token_secret, access_token import accessStream import cleanText #from main import keyword_list df = cleanText.csvToPandasDF("tweets_out.csv") #print df.head(1) #print df.shape df.insert(0, 'uID', range(0, df.shape[0])) #print df.head(4) #print df.shape #keyword_list = ['bernie, sanders, hillary, clinton'] keyword_list = ['bernie, sanders, hillary, clinton'] keywords = keyword_list[0].replace(',', '').split() #keywords = keyword_list[0].replace(',', '').split() #print keywords
print("starting capturing stream") #twitterStream = Stream(auth, accessStream.Listener(start_time, time_limit,inputFile)) twitterStream = Stream( auth, accessStream.Listener(tweet_limit, start_time, time_limit, inputFile)) #WHY THE F**K DOESN'T THIS WORK twitterStream.filter( track=keyword_list) #call the filter method to run the Stream Listener print("done capturing stream") print("cleaning tweets") cleanText.jsonUTF8toCsv(inputFile, outputFile) print("tweets cleaned to CSV. CSV created") cleanDF = cleanText.csvToPandasDF(outputFile) print("data frame created.") print("identifying subject of the tweet") textList = cleanDF.loc[:, ['text']] subject_array = [] clean_text = [] #keywords = keyword_list[0].replace(',', '').split() this can only be used in one place or only first word comes through. for i in xrange(len(textList)): subject_array.append( cleanText.identifySubject(textList['text'][i], keyword_list)) clean_text.append(