Example #1
    def testBuildKeywordObject(self):
        ''' Just checks the ability to build a keyword object from a token/keyword/hashtag '''

        twt = processTweet(self.vt)
        twt.buildKeywordObject('hello_world')
        
        # Check we see 1 object in the list
        self.assertEqual(len(twt.keywords), 1)

        # Check it has a valid MGRS
        self.assertEqual(twt.keywords[0].mgrs, '38SND4595706622')
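The MGRS string asserted here ('38SND4595706622') is a full-precision (1 m) grid reference. As a minimal sketch of how a lat/lon pair maps to such a string, assuming the third-party mgrs package (pip install mgrs); the conversion inside processTweet may well differ:

import mgrs

m = mgrs.MGRS()
# Returns a 15-character reference like '38SND...' at the default
# 5-digit precision (1 m); some versions return bytes rather than str
print m.toMGRS(36.5, 43.5)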
Example #2
    def testFromHashTag(self):
        ''' Checks getting a list of keywords from the tweet's hashtags '''
        
        # Process to keywords
        twt = processTweet(self.vt)
        twt.fromHashTag()
        
        # Check we see 3 objects in the list
        self.assertEqual(len(twt.keywords), 3)

        # Check one of those has a valid MGRS
        self.assertEqual(twt.keywords[0].mgrs, '38SND4595706622')
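fromHashTag presumably promotes each hashtag in the tweet text to a keyword object. A minimal sketch of just the extraction step, assuming a simple regex approach (extractHashTags is an illustrative name, not the real implementation):

import re

def extractHashTags(text):
    ''' Return hashtag tokens found in the text, without the '#' prefix. '''
    return re.findall(r'#(\w+)', text)

print extractHashTags('fire near the #mall, #police on scene #help')
# ['mall', 'police', 'help']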
Example #3
    def testFromGazetteer(self):
        ''' Check keywords based on a gazetteer lookup.
            Initially this could just be a python dict, but it could be
            extended to a SQLite or Mongo collection of names/places. '''


        twt = processTweet(self.vt)
        twt.fromLookup()
        
        print twt.keywords
        print len(twt.keywords)
        # Simple length check based on content in the tweet
        self.assertEqual(len(twt.keywords), 2)
        
        # Check one of the keywords
        self.assertEqual(twt.keywords[0].keyword, 'germ')
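The dict-backed gazetteer the docstring describes could look something like the sketch below; the terms and MGRS values are illustrative only, and lookupKeywords is a hypothetical helper rather than the real fromLookup:

# Illustrative gazetteer: term -> MGRS cell
GAZETTEER = {
    'germ':     '38SND4595706622',
    'hospital': '38SND4612307891',
}

def lookupKeywords(text):
    ''' Return (term, mgrs) pairs for gazetteer terms found in the text. '''
    tokens = text.lower().split()
    return [(t, GAZETTEER[t]) for t in tokens if t in GAZETTEER]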
Example #4
import datetime
import os
import sys
import time

# Project-local helpers (retrieveFile, processTweet, vastTweet, timeSeries,
# buildBlankData, getGeos, getTime, callBaseliner, mdb, jmsCode) are assumed
# to be importable from the surrounding package.

def main():
    '''
    Script to build tweet objects from the VAST dataset and place them on a Queue and/or JMS
    for testing purposes.

    Likely speed improvements:
    - Building blank arrays in the time series takes a while.
    - Put the keywords on a queue and have threads process each one (see the
      sketch after this function).
    - Any duplication checks?
    '''
    db = 'bam'
    host = 'localhost'
    port = 27017
    
    start = datetime.datetime.utcnow()
    tweetProcessTimes = datetime.timedelta(seconds=0)
    
    blUnits     = 'minute'
    blPrecision = 10
    baselineParameters = [blUnits, blPrecision] 
    mgrsPrecision = 2
    
    #dripRate = 1.5
    
    # JMS destination
    #destination = '/topic/test.vasttweets'
    #hostIn      = 'localhost'
    #portIn      = 61613

    # Reset the collections
    c, dbh = mdb.getHandle()
    dbh = mdb.setupCollections(dbh, dropCollections=True)         # Set up collections
    dbh = mdb.setupIndexes(dbh)
    
    #jms = jmsCode.jmsHandler(hostIn, portIn, verbose=True)
    # Make the JMS connection via STOMP and the jmsCode class
    #jms.connect()
     
    path = "/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/data/"
    #fName= "MicroblogsSample.csv"
    fName= "MicroblogsOrdered.csv"
    tweetStats = 'tweetStatsFile_50000.csv'
    tptFile = open(path+tweetStats, 'w')
    
    # The script used to generate the baseline
    baselinePath = '/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/src/scripts/'
    baselineScript = 'subprocessBaseline.py'
    scriptFile = os.path.join(baselinePath, baselineScript)

    
    f = retrieveFile(path, fName)
    x = 0
    
    # Start time
    earliestTweet = datetime.datetime(2011, 4, 30, 0, 0)
    earliestTweet = time.mktime(earliestTweet.timetuple())
    lastTweetTime = earliestTweet
    print "First Tweet Time: ", lastTweetTime
    
    # Replay speed-up factor: real inter-tweet gaps are divided by this
    # (used by the commented-out delay logic below)
    speedUpRate = 1000
    
    # Build a blank timeseries array once to save rebuilding it every time
    blankData = buildBlankData(hours=24)
    
    # Loop the lines and build tweet objects
    for line in f.readlines():
        
        #print line
        # Strip any trailing newline/carriage-return characters
        line = line.rstrip('\r\n')

        # Skip the header row
        if x == 0:
            x += 1
            continue
        
        if x % 100 == 0:
            print "processed: ", x

        # Stop after 100,000 records
        if x > 100000:
            print line
            break
            
        # Split into at most 4 fields so commas in the tweet text don't break unpacking
        line = line.split(',', 3)
        
        tweetProcessStart = datetime.datetime.utcnow()
        
        tweetId, dt, latLon, text = line
        
        # Get the geos
        geos = getGeos(tweetId, latLon)
        if not geos:
            print "skipping this record - bad or no geos"
            continue
        
        # Get the datetime group into seconds since UNIX time
        dtg = getTime(tweetId, dt)

        if not dtg:
            print "skipping this record - bad or no time"
            continue
        
        # Get the tweettime into seconds from UNIX
        tweetTime = time.mktime(dtg.timetuple())
        #print "The time of this tweet", tweetTime
        
        # Get the tweet time in seconds since the last tweet
        sinceLastTweet = tweetTime - lastTweetTime
        #print "Time since last tweet", sinceLastTweet
        
        #delay = sinceLastTweet / speedUpRate
        #print "Delay: ", delay
                
        # Apply a scaling to it
        #time.sleep(delay)
        
        # Assign this tweet time to be the last tweet time
        lastTweetTime = tweetTime
        
        # Build a tweet object
        twt = vastTweet()
        twt.importData(timeStamp=dtg, lat=geos[0], lon=geos[1], text=text, tweetId=tweetId)
        
        #----------------------------------------------------------------------------------
        # PROCESS INTO KEYWORDS
                
        # Build into keywords - skipping a step for development
        kywd = processTweet(twt, mgrsPrecision)
        
        # Add keywords to the list based on hashtags
        kywd.fromHashTag()
        
        # Add keywords to the list based on name lookup
        kywd.fromLookup()

        if len(kywd.keywords) == 0:
            pass    # No keyword matches; the loop below simply does nothing
            #print "No matches: ", twt.text
        
        xx = 0
        # Now loop the resultant keywords
        for kwObj in kywd.keywords:
            
            xx += 1
            
            #print "------------------"
            #print kwObj.keyword
            #print kwObj.text
        
            #-------------------------------------------------------
            # Pass keyword object into a class
            #ts = timeSeries(host='localhost', port=27017, db='bam')
            ts = timeSeries(c=c, dbh=dbh)
            ts.importData(kwObj, blockPrecision=24)
    
            success = ts.insertDoc(blankData=blankData, incrementBy=100)
  
            callBaseliner(scriptFile, host, port, db, kwObj, baselineParameters, mac=1)
  
        # METRICS - currently about 0.05 seconds per tweet
        tweetProcessStop = datetime.datetime.utcnow()
        tweetProcessTimes += (tweetProcessStop - tweetProcessStart)
        processDif = (tweetProcessStop - tweetProcessStart) 
        tptFile.write(str(x)+","+str(xx)+","+str(processDif.seconds + processDif.microseconds/1000000.)+"\n")
        #----------------------------------------------------------------------------------
        # SEND TO JMS WITH THIS CODE

        # Convert it into a JSON object
        #jTwt = twt.vastTweet2Json()
        #print jTwt

        # Push the JSON version of the tweet to the JMS
        #jms.sendData(destination, jTwt, x)

        #----------------------------------------------------------------------------------
        
        x += 1
    
        #time.sleep(dripRate)
        
    # Disconnect from the JMS
    #jms.disConnect()    

    end = datetime.datetime.utcnow()
    dif = end - start
    
    print "Total Tweet Process Time: %s" %tweetProcessTimes.seconds
    print "Average Tweet process time: %s" % (float(tweetProcessTimes.seconds)/float(x))

    print "Tweet Processed: %s" %x
    print "Total Process Time: %s" %(dif)
    
    # Close the mongo connection
    mdb.close(c, dbh)
    f.close()
    tptFile.close()
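A minimal sketch of the queue-based speed improvement suggested in main()'s docstring: worker threads drain keyword objects from a Queue instead of handling each one inline. handleKeyword stands in for the timeSeries insert and baseliner call; all names here are illustrative:

import threading
from Queue import Queue

def handleKeyword(kwObj):
    pass    # e.g. timeSeries insert + callBaseliner

def worker(q):
    while True:
        kwObj = q.get()
        handleKeyword(kwObj)
        q.task_done()

def processWithQueue(keywordObjects, numThreads=4):
    q = Queue()
    for _ in range(numThreads):
        t = threading.Thread(target=worker, args=(q,))
        t.setDaemon(True)    # threads die with the main process
        t.start()
    for kwObj in keywordObjects:
        q.put(kwObj)
    q.join()                 # block until every keyword is handled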