Example No. 1
from datetime import timedelta

import IO     # project-local I/O helpers
import Tweet  # project-local Tweet class


def classifyTweets(tweetFile, history, tag, sSaveFile, offset=3):
    stopWords = getStopWords()
    tweets = []
    for line in IO.readData_by_line(tweetFile):
        tweet = Tweet.Tweet()
        tweet.setTweet(line)
        if tweet.containsTag("#" + tag):
            # Pair the tweet with the market movement `offset` days later
            stamp = tweet.date + timedelta(days=offset)
            if stamp.date() in history:
                tweet.label = history[stamp.date()]
                tweet.removeStopWords(stopWords)
                tweets.append(tweet)
    print len(tweets)
    tweetFile.close()
    IO.writeTweets(sSaveFile, tweets, ['label', 'trimmedMessage'])
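For context, `history` is expected to map a `datetime.date` to whatever label `priceHistory` produced for that day. A minimal sketch of a call, with a hand-built history dict (the ticker, dates, and labels here are all made up for illustration):

from datetime import date

# Hypothetical three-day history: date -> movement label
history = {date(2012, 3, 5): 1,
           date(2012, 3, 6): 0,
           date(2012, 3, 7): 1}

tweetFile = open("data/scrapeCompanies.txt")
classifyTweets(tweetFile, history, "AAPL", "data/ClassifiedAAPL.txt", offset=3)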
Example No. 2
def classifyTweetsCompany(tag, _offset=3):
    tweetFile = open("data/scrapeCompanies.txt")
    priceData = IO.readData("data/" + tag + ".csv", ',')
    priceIter = iter(priceData)
    next(priceIter)  # skip the CSV header row
    priceHist = priceHistory(priceIter, "%Y-%m-%d", 2)

    classifyTweets(tweetFile, priceHist, tag, "data/Classified" + tag + ".txt", offset=_offset)
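`priceHistory` itself does not appear among these examples. Judging from its call sites, it takes an iterable of rows, a strptime date format, and the index of the value column. A minimal sketch under those assumptions (the real function may compute an up/down label rather than store the raw column value):

from datetime import datetime

def priceHistory(rows, dateFormat, valueCol):
    # Hypothetical reconstruction: map each row's parsed date (assumed
    # to sit in column 0) to the value found in column valueCol
    history = {}
    for row in rows:
        day = datetime.strptime(row[0], dateFormat).date()
        history[day] = row[valueCol]
    return history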
Example No. 3
def classifyTweetsDJIA(_offset=3):
    tweetFile = open("data/scrapeDJIA.txt")
    priceData = IO.readData("data/DJIA.tsv")  # no delimiter argument: relies on IO.readData's default
    priceHist = priceHistory(priceData, "%b %d, %Y", 1)

    classifyTweets(tweetFile,
                   priceHist,
                   "DJIA",
                   "data/ClassifiedDJIA.txt",
                   offset=_offset)
Example No. 4
def setTweetsEmotion():
    # Get the stemmed emotion lexicon
    arrEmo = getEmotions()
    # Analyse tweet emotion
    data = open("data/scrapeCompanies.txt")
    # Read every tweet
    for line in IO.readData_by_line(data):
        tweet = Tweet.Tweet()
        tweet.setTweet(line)
        # Check every emotion word and stop at the first match
        value = 0
        for emo in arrEmo:
            word = emo[0]
            if word in tweet.message:
                # Set the polarity from the lexicon entry
                value = 1 if emo[1] == "1" else -1
                break
        # Assign the label once, after the scan, so breaking out of the
        # loop can never leave tweet.label unset
        tweet.label = value
        print tweet.label, " ", tweet.message
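Note that `word in tweet.message` is a substring test, so a stemmed lexicon entry can also fire inside an unrelated longer word. A stricter token-level check is sketched below (a hypothetical helper, not part of the project; it stems the tweet's own tokens so they line up with the stemmed lexicon):

def matchesEmotion(word, message):
    # Compare whole stemmed tokens instead of raw substrings
    return word in [stem(tok) for tok in message.lower().split()]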
Example No. 5
def getStopWords():
    # Stop-word list used when trimming tweets before classification
    arr = IO.readData("data/StopWords.txt")
    return arr
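`IO.readData` is project-local code that is not shown here. Judging from its uses in these examples, it splits a file into rows, and into columns when a delimiter is supplied. A hypothetical stand-in, assuming tab as the default delimiter (the real default is not visible here):

def readData(path, delim='\t'):
    # Hypothetical stand-in for IO.readData: one list per non-empty
    # line, split on the delimiter
    with open(path) as f:
        return [line.rstrip('\n').split(delim) for line in f if line.strip()]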
Example No. 6
def getEmotions():
    arr = IO.readData("data/Emotions.txt")
    # Stem each lexicon word so it can match stemmed tweet text
    for index in range(len(arr)):
        arr[index][0] = stem(arr[index][0])
    return arr
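`stem` must be imported from a stemming library, but which one this project uses is not shown. The `stemming` package's Porter2 implementation is one plausible source (an assumption):

from stemming.porter2 import stem  # assumed source; nltk's PorterStemmer is another option

print stem("happiness")  # prints the stemmed form, e.g. "happi"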
Example No. 7
import time
from os.path import exists

def searchTwitter(tags, fileName):
    print "Start Twitter scraping for " + str(tags)
    j = 1
    fileName = "data/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)

    # If a data file exists, read the newest tweet we already have and
    # only ask for tweets posted after it; otherwise start from scratch
    if exists(fileName + fileExt):
        lastID, lastTime = getLastTweetID(fileName + fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID)
    else:
        print "No file " + fileName + fileExt + " found, searching without sinceID"

    # Initial search
    tweets = getTweets(sQuery + sOptions)
    if len(tweets) < 2:
        print "No search results"
        return

    # Page backwards: each query asks for tweets no newer than the
    # oldest tweet seen so far
    oldestID = tweets[-1][0]
    go_on = True
    i = 1

    while go_on:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)

        if len(results) < 2:  # empty page or error: stop paging
            go_on = False
        else:
            time.sleep(1.1)  # stay under Twitter's rate limit
            i += 1
            oldestID = results[-1][0]

        tweets += results[1:]  # first result repeats "oldestID", so drop it

        if i >= 250:  # flush to a numbered buffer file on long scrapes
            IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
            j += 1
            tweets = []
            i = 0

    # Save the remaining tweets, then merge any buffer files in reverse order
    IO.writeData(fileName + fileExt, tweets, True, False)
    j -= 1
    while j >= 1:
        bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
        IO.writeData(fileName + fileExt, bfr, True, False)
        IO.deleteFile(fileName + "_P" + str(j) + fileExt)  # remove temporary file
        j -= 1
    print "Finished Twitter scrape"
Example No. 8
def getLastTweetID(sFile):
    # The final line of a scrape file holds the newest tweet scraped so
    # far: its ID, then its timestamp, tab-separated
    line = IO.readLastLine(sFile)
    line = line.split('\t')
    return line[0], line[1]
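`IO.readLastLine` is also project-local. A hypothetical stand-in that scans to the final non-empty line (the real helper may seek from the end of the file instead, which is faster on large scrapes):

def readLastLine(sFile):
    last = ''
    with open(sFile) as f:
        for line in f:
            if line.strip():
                last = line.rstrip('\n')
    return last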
Example No. 9
import time
from os.path import exists

def searchTwitter(tags, fileName, oldestID=0, j=1):
    print "Start Twitter scraping for " + str(tags)
    fileName = "data/scrapes/" + fileName
    fileExt = ".txt"
    sOptions = twitterRPP(100)
    sQuery = twitterBuildTagString(tags)

    # If a data file exists, read the newest tweet we already have and
    # only ask for tweets posted after it; otherwise start from scratch
    if exists(fileName + fileExt):
        lastID, lastTime = getLastTweetID(fileName + fileExt)
        print "Last tweet ID: " + lastID + " at time: " + lastTime
        sOptions += twitterSinceID(lastID)
    else:
        print "No file " + fileName + fileExt + " found, searching without sinceID"

    # Initial search; a nonzero oldestID resumes paging below a known tweet ID
    if oldestID == 0:
        tweets = getTweets(sQuery + sOptions)
    else:
        tweets = getTweets(sQuery + sOptions + twitterMaxID(oldestID))
    if len(tweets) < 2:
        print "No search results"
        return

    # Page backwards: each query asks for tweets no newer than the
    # oldest tweet seen so far
    oldestID = tweets[-1][0]
    go_on = True
    i = 1

    while go_on:
        sOptions2 = sOptions + twitterMaxID(oldestID)
        results = getTweets(sQuery + sOptions2, i)

        if len(results) < 2:  # empty page or error: stop paging
            go_on = False
        else:
            time.sleep(1.1)  # stay under Twitter's rate limit
            i += 1
            oldestID = results[-1][0]

        tweets += results[1:]  # first result repeats "oldestID", so drop it

        if i >= 250:  # flush to a numbered buffer file on long scrapes
            IO.writeData(fileName + "_P" + str(j) + fileExt, tweets, overWrite=True)
            j += 1
            tweets = []
            i = 0

    # Save the remaining tweets, then merge any buffer files in reverse order
    IO.writeData(fileName + fileExt, tweets, True, False)
    j -= 1
    while j >= 1:
        bfr = IO.readData(fileName + "_P" + str(j) + fileExt)
        IO.writeData(fileName + fileExt, bfr, True, False)
        IO.deleteFile(fileName + "_P" + str(j) + fileExt)  # remove temporary file
        j -= 1
    print "Finished " + fileName + " scrape"