    def testBuildKeywordObject(self):
        ''' Just checks the ability to build a keyword object with a token/keyword/hashtag '''
        twt = processTweet(self.vt)
        twt.buildKeywordObject('hello_world')
        # Check we see 1 object in the list
        self.assertEquals(len(twt.keywords), 1)
        # Check that object has a valid MGRS
        self.assertEquals(twt.keywords[0].mgrs, '38SND4595706622')
    def testFromHashTag(self):
        ''' Checks getting a list of keywords from the tweet's keyword attr '''
        # Process to keywords
        twt = processTweet(self.vt)
        twt.fromHashTag()
        # Check we see 3 objects in the list
        self.assertEquals(len(twt.keywords), 3)
        # Check one of those has a valid MGRS
        self.assertEquals(twt.keywords[0].mgrs, '38SND4595706622')
    def testFromGazetteer(self):
        ''' Check keywords based on a gazetteer lookup.
            Initially this could just be a python dict, but it could be extended to a
            SQLite or Mongo collection of names/places (see the sketch below). '''
        twt = processTweet(self.vt)
        twt.fromLookup()
        print twt.keywords
        print len(twt.keywords)
        # Simple length check based on content in the tweet
        self.assertEquals(len(twt.keywords), 2)
        # Check one of the keywords
        self.assertEquals(twt.keywords[0].keyword, 'germ')
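# --- Illustrative sketch, not part of the original test suite ---
# The testFromGazetteer docstring suggests the gazetteer could grow from a plain
# python dict into a SQLite (or Mongo) collection of names/places. A minimal
# sketch of the SQLite form, assuming a hypothetical lookupPlaces() helper and a
# `places` table with (name, mgrs) columns; none of these names exist in this module.
import sqlite3

def lookupPlaces(dbPath, tokens):
    ''' Return (name, mgrs) rows for any token that matches a gazetteer place name. '''
    conn = sqlite3.connect(dbPath)
    try:
        qMarks = ','.join('?' * len(tokens))
        cur = conn.execute("SELECT name, mgrs FROM places WHERE name IN (%s)" % qMarks,
                           [t.lower() for t in tokens])
        return cur.fetchall()
    finally:
        conn.close()

# Usage: lookupPlaces('gazetteer.db', twt.text.split()) would return the matching
# (name, mgrs) pairs that a lookup step like fromLookup() could turn into keyword objects.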
def main():
    ''' Script to build tweet objects from the VAST dataset and place them on a Queue
        and/or JMS for testing purposes.

        LIKELY SPEED IMPROVEMENTS:
        - BUILDING BLANK ARRAYS IN THE TIME SERIES TAKES A WHILE
        - PUTTING THE KEYWORDS IN A QUEUE, HAVING SET UP THE THREADS TO PROCESS EACH ONE
          (see the threaded-queue sketch after this function).
        - ANY DUPLICATION CHECKS? '''

    db = 'bam'
    host = 'localhost'
    port = 27017

    start = datetime.datetime.utcnow()
    tweetProcessTimes = datetime.timedelta(seconds=0)

    blUnits = 'minute'
    blPrecision = 10
    baselineParameters = [blUnits, blPrecision]
    mgrsPrecision = 2
    #dripRate = 1.5

    # JMS destination
    #destination = '/topic/test.vasttweets'
    #hostIn = 'localhost'
    #portIn = 61613

    # Reset the collections
    c, dbh = mdb.getHandle()
    dbh = mdb.setupCollections(dbh, dropCollections=True)
    # Set up collections
    dbh = mdb.setupIndexes(dbh)

    #jms = jmsCode.jmsHandler(hostIn, portIn, verbose=True)
    # Make the JMS connection via STOMP and the jmsCode class
    #jms.connect()

    path = "/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/data/"
    #fName = "MicroblogsSample.csv"
    fName = "MicroblogsOrdered.csv"
    tweetStats = 'tweetStatsFile_50000.csv'
    tptFile = open(path + tweetStats, 'w')

    # The script used to generate the baseline
    baselinePath = '/Users/brantinghamr/Documents/Code/eclipseWorkspace/bam/src/scripts/'
    baselineScript = 'subprocessBaseline.py'
    scriptFile = os.path.join(baselinePath, baselineScript)

    f = retrieveFile(path, fName)
    x = 0

    # Start time
    earliestTweet = datetime.datetime(2011, 4, 30, 0, 0)
    earliestTweet = time.mktime(time.struct_time(earliestTweet.timetuple()))
    lastTweetTime = earliestTweet
    print "First Tweet Time: ", lastTweetTime

    # This speeds things up from seconds to minutes
    speedUpRate = 1000

    # Build a blank timeseries array to save it being built every time
    blankData = buildBlankData(hours=24)

    # Loop the lines and build tweet objects
    for line in f.readlines():
        #print line
        # Extract content from each line
        line = line.rstrip('\r').rstrip('\n').rstrip('\r')

        # Skip the first line of the file
        if x == 0:
            x += 1
            continue
        if x % 100 == 0:
            print "processed: ", x
        if x > 100000:
            print line
            break
            #sys.exit(0)    # unreachable: the break above already ends the loop

        line = line.split(',')
        tweetProcessStart = datetime.datetime.utcnow()
        tweetId, dt, latLon, text = line

        # Get the geos
        geos = getGeos(tweetId, latLon)
        if not geos:
            print "skipping this record - bad or no geos"
            continue

        # Get the datetime group into seconds since UNIX time
        dtg = getTime(tweetId, dt)
        if not dtg:
            print "skipping this record - bad or no time"
            continue

        # Get the tweet time into seconds from UNIX
        tweetTime = time.mktime(time.struct_time(dtg.timetuple()))
        #print "The time of this tweet", tweetTime

        # Get the tweet time in seconds since the last tweet
        sinceLastTweet = tweetTime - lastTweetTime
        #print "Time since last tweet", sinceLastTweet

        #delay = sinceLastTweet / speedUpRate
        #print "Delay: ", delay
        # Apply a scaling to it
        #time.sleep(delay)

        # Assign this tweet time to be the last tweet time
        lastTweetTime = tweetTime

        # Build a tweet object
        twt = vastTweet()
        twt.importData(timeStamp=dtg, lat=geos[0], lon=geos[1], text=text, tweetId=tweetId)

        #----------------------------------------------------------------------------------
        # PROCESS INTO KEYWORDS

        # Build into keywords - skipping a step for development
        kywd = processTweet(twt, mgrsPrecision)
        # Add keywords to the list based on hashtags
        kywd.fromHashTag()
        # Add keywords to the list based on name lookup
        kywd.fromLookup()
        if len(kywd.keywords) == 0:
            pass
            #print "No matches: ", twt.text

        xx = 0
        # Now loop the resultant keywords
        for kwObj in kywd.keywords:
            xx += 1
            #print "------------------"
            #print kwObj.keyword
            #print kwObj.text

            #-------------------------------------------------------
            # Pass keyword object into a class
            #ts = timeSeries(host='localhost', port=27017, db='bam')
            ts = timeSeries(c=c, dbh=dbh)
            ts.importData(kwObj, blockPrecision=24)
            success = ts.insertDoc(blankData=blankData, incrementBy=100)

            callBaseliner(scriptFile, host, port, db, kwObj, baselineParameters, mac=1)

        # METRICS - currently about 0.05 seconds per tweet
        tweetProcessStop = datetime.datetime.utcnow()
        tweetProcessTimes += (tweetProcessStop - tweetProcessStart)
        processDif = (tweetProcessStop - tweetProcessStart)
        tptFile.write(str(x) + "," + str(xx) + "," + str(processDif.seconds + processDif.microseconds / 1000000.) + "\n")

        #----------------------------------------------------------------------------------
        # SEND TO JMS WITH THIS CODE

        # Convert it into a JSON object
        #jTwt = twt.vastTweet2Json()
        #print jTwt
        # Push the JSON version of the tweet to the JMS
        #jms.sendData(destination, jTwt, x)
        #----------------------------------------------------------------------------------

        x += 1
        #time.sleep(dripRate)

    # Disconnect from the JMS
    #jms.disConnect()

    end = datetime.datetime.utcnow()
    dif = end - start

    print "Total Tweet Process Time: %s" % tweetProcessTimes.seconds
    print "Average Tweet process time: %s" % (float(tweetProcessTimes.seconds) / float(x))
    print "Tweets Processed: %s" % x
    print "Total Process Time: %s" % (dif)

    # Close the mongo connection
    mdb.close(c, dbh)

    f.close()
    tptFile.close()
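# --- Illustrative sketch, not part of the original script ---
# The main() docstring flags "PUTTING THE KEYWORDS IN A QUEUE, HAVING SET UP THE
# THREADS TO PROCESS EACH ONE" as a likely speed improvement. A minimal sketch of
# that pattern, assuming each worker holds its own mongo handle; keywordWorker and
# runKeywordWorkers are hypothetical names, not functions in this module.
import Queue
import threading

def keywordWorker(kwQueue, blankData):
    ''' Pull keyword objects off the queue and insert them until a None sentinel arrives. '''
    c, dbh = mdb.getHandle()
    while True:
        kwObj = kwQueue.get()
        if kwObj is None:
            kwQueue.task_done()
            break
        ts = timeSeries(c=c, dbh=dbh)
        ts.importData(kwObj, blockPrecision=24)
        ts.insertDoc(blankData=blankData, incrementBy=100)
        kwQueue.task_done()
    mdb.close(c, dbh)

def runKeywordWorkers(blankData, workerCount=4):
    ''' Start daemon workers and return the queue. The main loop would then call
        kwQueue.put(kwObj) instead of inserting inline, put one None per worker
        when finished, and call kwQueue.join() to wait for the backlog to drain. '''
    kwQueue = Queue.Queue()
    for _ in range(workerCount):
        t = threading.Thread(target=keywordWorker, args=(kwQueue, blankData))
        t.daemon = True
        t.start()
    return kwQueue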