# Module-level imports assumed by the functions in this section; they may
# already be declared at the top of the file. The stemmer import is an
# assumption: substitute whichever stemming package the project actually uses.
import time
from datetime import timedelta

import IO
import Tweet
from stemming.porter2 import stem  # assumed stemmer providing stem(word)


def classifyTweets(tweetFile, history, tag, sSaveFile, offset=3):
    stopWords = getStopWords()
    tweets = []
    for line in IO.readData_by_line(tweetFile):
        tweet = Tweet.Tweet()
        tweet.setTweet(line)
        if tweet.containsTag("#" + tag):
            # Label the tweet with the price movement `offset` days after it was posted
            stamp = tweet.date + timedelta(days=offset)
            if stamp.date() in history:
                tweet.label = history[stamp.date()]
                tweet.removeStopWords(stopWords)
                tweets.append(tweet)
    print len(tweets)
    tweetFile.close()
    IO.writeTweets(sSaveFile, tweets, ['label', 'trimmedMessage'])
def classifyTweetsCompany(tag, _offset=3):
    tweetFile = open("data/scrapeCompanies.txt")
    priceData = IO.readData("data/" + tag + ".csv", ',')
    priceIter = iter(priceData)
    next(priceIter)  # Skip the CSV header row
    priceHist = priceHistory(priceIter, "%Y-%m-%d", 2)
    classifyTweets(tweetFile, priceHist, tag, "data/Classified" + tag + ".txt", offset=_offset)
def classifyTweetsDJIA(_offset=3):
    tweetFile = open("data/scrapeDJIA.txt")
    priceData = IO.readData("data/DJIA.tsv")
    priceHist = priceHistory(priceData, "%b %d, %Y", 1)
    classifyTweets(tweetFile, priceHist, "DJIA", "data/ClassifiedDJIA.txt", offset=_offset)
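# Usage sketch (hypothetical driver, not part of the original module): label
# the DJIA scrape and one per-company scrape with the default three-day offset.
# "AAPL" is only an illustrative ticker; any tag with a matching
# data/<tag>.csv price file and tweets in data/scrapeCompanies.txt would work.
def classifyExample():
    classifyTweetsDJIA(_offset=3)
    classifyTweetsCompany("AAPL", _offset=3)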
def setTweetsEmotion():
    # Get emotions
    arrEmo = getEmotions()
    # Analyse tweet emotion
    data = open("data/scrapeCompanies.txt")
    # Read every tweet
    for line in IO.readData_by_line(data):
        tweet = Tweet.Tweet()
        tweet.setTweet(line)
        # Check every emotion word; the first match decides the label
        value = 0
        for emo in arrEmo:
            word = emo[0]
            if word in tweet.message:
                # Update value by emotion polarity
                if emo[1] == "1":
                    value = 1
                else:
                    value = -1
                if value != 0:
                    break
        tweet.label = value
        print tweet.label, " ", tweet.message
def getStopWords():
    arr = IO.readData("data/StopWords.txt")
    return arr
def getEmotions():
    arr = IO.readData("data/Emotions.txt")
    # Stem the emotion words so they match the stemmed tweet text
    for index in range(0, len(arr)):
        arr[index][0] = stem(arr[index][0])
    return arr
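# Illustrative sketch (an assumption, not part of the original module): the rows
# returned by getEmotions are treated as [word, polarity] pairs, where "1" marks
# a positive word and any other value a negative one. scoreMessageExample applies
# the same first-match rule that setTweetsEmotion uses, on a hand-built list.
def scoreMessageExample(message):
    arrEmo = [["happi", "1"], ["crash", "0"]]  # hand-built, already-stemmed examples
    for word, polarity in arrEmo:
        if word in message:
            return 1 if polarity == "1" else -1
    return 0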
def searchTwitter(tags, fileName):
    print "Start Twitter scraping for " + str(tags)
    j = 1
    fileName = "data/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)
    # If the data file exists, read the latest tweet, otherwise skip
    from os.path import exists
    if exists(fileName + fileExt):
        lastID, lastTime = getLastTweetID(fileName + fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID)  # Only fetch tweets newer than the last one scraped
    else:
        print "No file " + fileName + fileExt + " found, searching without maxID"
    # Initial search
    tweets = getTweets(sQuery + sOptions)
    if len(tweets) < 2:
        print "No search results"
        return
    # Continue searching from the oldest tweet found in every batch
    oldestID = tweets[-1][0]  # ID of the oldest tweet, used for the next query
    go_on = True
    i = 1
    while go_on:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)
        if len(results) < 2:  # Catch empty results and errors
            go_on = False
        else:
            time.sleep(1.1)  # Sleep a bit so Twitter doesn't throw us out
            i += 1
            oldestID = results[-1][0]  # ID of the oldest tweet, used for the next query
            tweets += results[1:]      # First result is the tweet with "oldestID", so drop it
            if i >= 250:
                # Back up data to a numbered part file if we acquire a lot
                IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
                j += 1
                tweets = []
                i = 0
    # Save data; if the buffer has been used, read the part files in reverse order
    if j == 1:
        IO.writeData(fileName + fileExt, tweets, True, False)
    else:
        IO.writeData(fileName + fileExt, tweets, True, False)
        j -= 1
        while j >= 1:
            bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
            IO.writeData(fileName + fileExt, bfr, True, False)
            IO.deleteFile(fileName + "_P" + str(j) + fileExt)  # Remove temporary file
            j -= 1
    print "Finished Twitter scrape"
def getLastTweetID(sFile):
    line = IO.readLastLine(sFile)
    line = line.split('\t')
    return line[0], line[1]
def searchTwitter(tags, fileName, oldestID=0, j=1):
    print "Start Twitter scraping for " + str(tags)
    fileName = "data/scrapes/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)
    # If the data file exists, read the latest tweet, otherwise skip
    from os.path import exists
    if exists(fileName + fileExt):
        lastID, lastTime = getLastTweetID(fileName + fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID)  # Only fetch tweets newer than the last one scraped
    else:
        print "No file " + fileName + fileExt + " found, searching without maxID"
    # Initial search; resume from a given oldestID if one was passed in
    if oldestID == 0:
        tweets = getTweets(sQuery + sOptions)
    else:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        tweets = getTweets(sQuery + sOptions2)
    if len(tweets) < 2:
        print "No search results"
        return
    # Continue searching from the oldest tweet found in every batch
    oldestID = tweets[-1][0]  # ID of the oldest tweet, used for the next query
    go_on = True
    i = 1
    while go_on:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)
        if len(results) < 2:  # Catch empty results and errors
            go_on = False
        else:
            time.sleep(1.1)  # Sleep a bit so Twitter doesn't throw us out
            i += 1
            oldestID = results[-1][0]  # ID of the oldest tweet, used for the next query
            tweets += results[1:]      # First result is the tweet with "oldestID", so drop it
            if i >= 250:
                # Back up data to a numbered part file if we acquire a lot
                IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
                j += 1
                tweets = []
                i = 0
    # Save data; if the buffer has been used, read the part files in reverse order
    if j == 1:
        IO.writeData(fileName + fileExt, tweets, True, False)
    else:
        IO.writeData(fileName + fileExt, tweets, True, False)
        j -= 1
        while j >= 1:
            bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
            IO.writeData(fileName + fileExt, bfr, True, False)
            IO.deleteFile(fileName + "_P" + str(j) + fileExt)  # Remove temporary file
            j -= 1
    print "Finished " + fileName + " scrape"
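# Usage sketch (hypothetical, not part of the original module): scrape tweets
# for two illustrative tags into data/scrapes/scrapeCompanies.txt, then resume
# an interrupted scrape from a known oldest tweet ID. The tag values, file
# name, ID, and part counter are assumptions; the exact tag format depends on
# what twitterBuildTagString expects.
def scrapeExample():
    searchTwitter(["AAPL", "MSFT"], "scrapeCompanies")
    # Resume deeper into the timeline from a previously seen oldest tweet ID
    searchTwitter(["AAPL", "MSFT"], "scrapeCompanies", oldestID="1234567890", j=2)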