예제 #1
0
def _lookupRequestsRemaining():
	"""Return the 'remaining' value reported for the /statuses/lookup endpoint."""
	rateLimit = getRateLimit('statuses')
	return rateLimit['resources']['statuses']['/statuses/lookup']['remaining']


def downloadAllTweets(start, end, tweetTargetSource):
	"""Download tweet data for a random sample of tweet ids and append it to a file.

	Reads 'data/tweets.csv' through readFile, keeps lines [start:end], draws a
	sample of at most 400000 of them, writes the header row to
	tweetTargetSource and then requests the tweets in windows of 100 rows,
	issuing at most 60 requests before re-checking the rate limit.

	Args:
		start: index of the first line of the id file to consider.
		end: index one past the last line of the id file to consider.
		tweetTargetSource: path handed to setHeaders / writeToFile for output.

	Returns:
		None. Results are written through writeToFile; progress is printed.
	"""
	maxSampleSize = 400000
	requestsPerWindow = 60
	rowsPerRequest = 100

	tweetIdSource = 'data/tweets.csv'
	dataLines = readFile(tweetIdSource).splitlines()[start:end]
	# Cap the sample size at the population size: asking for more items than
	# exist fails (ValueError, assuming `sample` is random.sample -- confirm).
	reducedDataLines = sample(dataLines, min(maxSampleSize, len(dataLines)))
	fieldnames = ['id_str', 'created_at', 'coordinates', 'hashtags', 'text']
	setHeaders(tweetTargetSource, fieldnames)

	totalRows = len(reducedDataLines)
	requestStartRow = 0
	requestEndRow = rowsPerRequest - 1
	# NOTE(review): rows after the last full 100-row window are never
	# requested; this loop bound is kept from the original because the
	# end-index semantics of parseTweetIds are not visible here.
	while requestEndRow <= totalRows:
		# Single-argument print(...) behaves the same on Python 2 and Python 3.
		print("Parsing rows " + str(requestStartRow) + '-' + str(requestStartRow + requestsPerWindow * rowsPerRequest) + ' out of ' + str(totalRows))
		print(str(float(requestStartRow)/float(totalRows) * 100) + "% complete")

		# Inner loop handles API rate limit logic. It also stops at the end of
		# the data so no lookups are issued for rows that do not exist.
		numRequests = 0
		while numRequests < requestsPerWindow and requestEndRow <= totalRows:
			if _lookupRequestsRemaining() == 0:
				break
			tweetIdList = parseTweetIds(reducedDataLines, requestStartRow, requestEndRow)
			tweetData = getTweetsFromId(tweetIdList)
			print("Request: " + str(numRequests))
			if tweetData is not None:
				writeToFile(tweetData, tweetTargetSource, fieldnames)
			requestStartRow += rowsPerRequest
			requestEndRow += rowsPerRequest
			numRequests += 1

		if requestEndRow > totalRows:
			# All full windows were requested; nothing left to wait for.
			break

		print("Rate Limit Exceeded. Waiting...\n")
		while _lookupRequestsRemaining() == 0:
			sleep(120)	# Suspends execution until rate limit refreshed
	print("Done!")
def saveTweetsToFile(tweetsAboutCountries, targetFileBasePath):
	"""Write one '<country>_tweets.csv' file per key of tweetsAboutCountries.

	For every key, the output path is targetFileBasePath + key + '_tweets.csv';
	the header row is written with setHeaders and the value stored under that
	key is passed to writeToFile with the same column list.

	Args:
		tweetsAboutCountries: mapping whose keys are used in the file name and
			whose values are handed to writeToFile.
		targetFileBasePath: string prefix prepended to every output file name.
	"""
	columns = ['id_str', 'created_at', 'coordinates', 'hashtags', 'text', 'distanceFromCountry']
	fileSuffix = '_tweets.csv'
	for countryName in tweetsAboutCountries:
		outputPath = targetFileBasePath + countryName + fileSuffix
		# Header first, then the rows collected for this key.
		setHeaders(outputPath, columns)
		countryTweets = tweetsAboutCountries[countryName]
		writeToFile(countryTweets, outputPath, columns)