def get_twitter_json(list_of_filenames, starting_at=1, ending_at=0, geocode=True):
    """
    1) reads in a list of fully-qualified filenames from "list_of_filenames"

    2) processes each row of each topsy file in "list_of_filenames",
       making batched calls to Twitter to retrieve the json for each tweet and
       adding the data from the topsy file, such as "score",
       plus unix timestamps for the topsy firstpost_date field and Twitter's created_at field,
       plus coordinate and place name data for Twitter's user location field

       after every 13,500 rows, or whenever there is a threshold-exceeded error,
       the program goes to sleep for 15 minutes

    On my quad i7 4G Windows 7 64-bit machine, with my Comcast Internet connection,
    I process about 1,700 tweets per minute. The sleep time for the Twitter
    threshold obviously increases the elapsed time, yielding roughly
    8 + 15 = 23 minutes elapsed time per 13,500 tweets. For 2,500,000 tweets,
    that's 2,500,000 / 13,500 * 23 = 4,259 minutes (about 71 hours, or 3 days)
    for all of the tweets in the project.

    Note: a file named twitter_credentials.py must be in the folder with the code
          (see the repo); it contains your Twitter credentials

    Note: if geocode=True, a file named mapquest_key.txt must be in the folder
          with the code; get a MapQuest key here: http://developer.mapquest.com/

    Note: if ["user"] is embedded in ["retweeted_status"], I do not get the
          location info. This, plus problems like blank or incomprehensible
          ['location'] fields, puts the geo-tagging rate at about 75%.

    Input:
        list_of_filenames   a text file with fully-qualified file names
        starting_at         the line number of "list_of_filenames" where processing should start
        ending_at           if 0, process all files beginning with the "starting_at"
                            line in "list_of_filenames"
                            if > 0, process the files from line "starting_at" to
                            line "ending_at" in "list_of_filenames"
        geocode             if True, batched requests are made to the MapQuest
                            Developers API for coordinate and place name data
                            if False, these calls are not made and no geo info is added

    Output:
        a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number

    Usage:
        %run get_twitter_json.py "filename_list.csv" 2 2
        - or -
        nohup python get_twitter_json.py "filename_list.csv" 1 0 &

    A message like "6 skipped id 448176144668721152" means that Twitter failed to
    return any data about the tweet with id 448176144668721152 and that this is
    the 6th such instance. Fewer than 1% of the tweets are skipped.

    To use the output file in python:
    =================================
        import json
        tweet_file = open("bigtweet_file002.json", "r")
        for line in tweet_file:
            tweet = json.loads(str(line))
            if tweet['retweet_count'] > 100:
                print "\n\n%d %s\n%s"%(tweet['retweet_count'], tweet['user']['name'], tweet['text'])

    To use the output file in R:
    ============================
        library(rjson)
        file_path = ("../files/bigtweet_file002.json")
        tweet_list = fromJSON(sprintf("[%s]", paste(readLines(file_path), collapse=",")))
        for (i in 1:length(tweet_list)){
            if (tweet_list[[i]]$retweet_count > 100){
                cat(sprintf("\n\n%d %s\n%s", tweet_list[[i]]$retweet_count,
                            tweet_list[[i]]$user$name, tweet_list[[i]]$text))
            }
        }
        ## convert to twitteR structure
        library(twitteR)
        tweets = import_statuses(raw_data=tweet_list)

    To store in MongoDB using python:
    =================================
        # create a python list of the tweets
        import json
        tweet_file = open("../files/bigtweet_file002.json", "r")
        tweet_list = [json.loads(str(line)) for line in tweet_file]

        # store the list in MongoDB
        from pymongo import MongoClient
        client = MongoClient()
        db = client['file002']
        posts = db.posts
        #db.posts.remove( { } )   # delete if previously created
        posts.insert(tweet_list)

        # same example as above
        for result in db.posts.find({ "retweet_count": { "$gt": 100 } }):
            print "%d %s\n%s"%(result['retweet_count'], result['user']['name'], result['text'])
    """
    import csv, json
    import re
    import time, datetime
    import sys, os
    import urllib2, urllib
    import os.path
    import logging   # used throughout; the original omitted this import

    from twitter_functions import lookup_multiple_tweets

    # convert input parameter strings to integers
    starting_at = int(starting_at)
    ending_at = int(ending_at)
    # note: bool("False") is True, so a string from the command line
    # must be compared explicitly
    if isinstance(geocode, str):
        geocode = geocode.lower() in ("true", "1")
    geocode = bool(geocode)

    msg = "\nlist_of_filenames %s; starting_at %d; ending_at %d; geocode %d"%(list_of_filenames, starting_at, ending_at, geocode)
    logging.info(msg)

    process_start = datetime.datetime.now()
    msg = "\n=======================================\nprocess start: %s"%process_start.strftime("%c") + \
          "\n=======================================\n"
    print msg
    sys.stdout.flush()
    logging.info(msg)

    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)

    output_filename = "bigtweet_file" + "%03d"%(starting_at,) + ".json"

    step = 100               # we're going to process in groups of "step"
    bulk_list = []           # batch of rows from the input file
    list_of_tweet_ids = []   # tweet ids of these rows
    output_dict = []         # list of dicts to send to the output file

    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch = 13500      # we sleep after this many lines processed
    sleep_batch_rows = 0     # the number of lines we've processed since the last sleep

    # MapQuest Developer API documentation: http://developer.mapquest.com/
    Geocoder_count = 0       # how many records we geocoded

    if geocode:
        f = open('mapquest_key.txt', 'r')
        key = f.readline()
        f.close()
        mapq_url = 'http://www.mapquestapi.com/geocoding/v1/batch?key='
        mapq_url = mapq_url + key + '&outFormat=json&maxResults=1&callback=renderBatch'
        logging.info("MAPQUEST URL " + mapq_url)

    number_of_files = len(filename_list)   # how many files are in the list
    file_counter = 1                       # which one this is
    global first_sleep
    first_sleep = True                     # first time through, we write an output_file header
    invalid_json = False                   # in case Twitter sends us junk
    global total_processed
    total_processed = 0                    # how many rows we have processed
    skip_counter = 0                       # how many rows we skipped because Twitter sent no info

    # read each file in and process it
    # ================================
    for input_filename in filename_list:

        # skip the first "starting_at - 1" files
        if file_counter < starting_at:
            msg = "Skipping %d of %d %s"%(file_counter, number_of_files, input_filename)
            print msg
            logging.info(msg)
            file_counter+=1
            continue
        if ending_at != 0:
            number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention,
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        #match = re.search(r"Twitter Data\\(.*)", input_filename)       # Windows Google Drive
        match = re.search("/home/ubuntu/files(.*)", input_filename)     # AWS Ubuntu
        short_file_name = match.group(1)

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                msg = "Ending before %d of %d %s"%(file_counter, number_of_files, input_filename)
                print msg
                logging.info(msg)
                break

        # check that the file exists
        if not os.path.isfile(input_filename):
            msg = "%s does not exist"%input_filename
            print msg
            logging.info(msg)
            file_counter+=1
            continue

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)       # list of all lines/rows in the input file
            totallines = len(lines)    # number of lines in the input file
            msg = "\n--Processing %d of %d %s rows %d"%(file_counter, number_of_files, short_file_name, totallines)
            print msg
            logging.info(msg)
            sys.stdout.flush()

            # read the input file line by line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows+=1
                if sleep_batch_rows > sleep_batch:
                    msg = "sleeping after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                    print msg
                    logging.info(msg)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]

                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):   # successful match at the start of the string
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    msg = "tweet url terminated with non-numeric in line %d"%(linenum+1)
                    print msg
                    logging.info(msg)
                    print row['url']
                    logging.info(row['url'])

                # if batch size is reached, process the batch
                if len(list_of_tweet_ids) >= step or (linenum+1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    result = lookup_multiple_tweets(list_of_tweet_ids)
                    list_of_tweet_ids = []

                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            msg = "\nTwitter returned invalid json"
                            print msg
                            logging.info(msg)
                            print e
                            logging.info(e)
                            msg = "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                            print msg
                            logging.info(msg)
                            bulk_list = []
                            invalid_json = True
                            break
                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    if 'errors' in tweetdata_list:
                        msg = "Twitter returned an error message:\n" + \
                              "message: " + str(tweetdata_list["errors"][0]['message']) + \
                              "\ncode: " + str(tweetdata_list["errors"][0]['code']) + \
                              "\nafter %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                        print msg
                        logging.info(msg)
                        sleep_batch_rows = 0
                        sleep_process(tweetdata_list, output_filename)
                        bulk_list = []   # we lose the batch
                        continue

                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested,
                    # so we create a dictionary for the tweetdata_list,
                    # associating ids with their positions in the list,
                    # plus lists of ids and locations for searching
                    tweet_id_dict = {}
                    tweet_id_list = []
                    tweet_loc_dict = {}
                    tweet_loc_list = []

                    # save every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)
                        # save every location and its position
                        if tweetdata_list[i]['user']['location'] is not None and tweetdata_list[i]['user']['location'].strip() != "":
                            try:
                                loc = str(tweetdata_list[i]['user']['location'])
                                tweet_loc_dict[loc] = i
                                tweet_loc_list.append(loc)
                            except:
                                pass

                    # pull each of the lines and its corresponding Twitter response
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter+=1
                            # check that the entire line['id'] is numeric
                            if re.match(r"^\d+", line['id']):   # yes
                                msg = "%d skipped id %d"%(skip_counter, int(line['id']))
                                print msg
                                logging.info(msg)
                            else:                               # no
                                print skip_counter
                                logging.info(skip_counter)
                                msg = "line['id'] is not all numeric"
                                print msg
                                logging.info(msg)
                                print line['id']
                                logging.info(line['id'])
                            continue

                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter+=1
                            msg = "id mismatch, skipping %d"%(skip_counter) + \
                                  "\nline id %s"%(str(line['id'])) + \
                                  "\ntweet id %s"%(str(tweetdata['id']))
                            print msg
                            logging.info(msg)
                            continue

                        # ===========================================
                        # add Topsy fields to Twitter's json response
                        # ===========================================
                        # tweetdata is the same dict object that sits in
                        # tweetdata_list, so everything added here shows up
                        # in the batch that gets written out

                        # add a timestamp for 'created_at'
                        # time.ctime(tweet['timestamp']) will decode this field
                        tweetdata['timestamp'] = \
                            time.mktime(datetime.datetime.strptime(tweetdata['created_at'],
                                                                   '%a %b %d %H:%M:%S +0000 %Y').timetuple())

                        tweetdata['topsy'] = {}

                        # add a timestamp for topsy's 'firstpost_date'
                        # (topsy dates appear as either mm/dd/yyyy or mm/dd/yy)
                        try:
                            tweetdata['topsy']['timestamp'] = \
                                time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%Y").timetuple())
                        except:
                            try:
                                tweetdata['topsy']['timestamp'] = \
                                    time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%y").timetuple())
                            except:
                                tweetdata['topsy']['timestamp'] = ""

                        # add the topsy csv file fields to the Twitter json
                        tweetdata['topsy']['firstpost_date'] = line['firstpost_date']
                        tweetdata['topsy']['score'] = float(line['score'])
                        tweetdata['topsy']['trackback_author_nick'] = line['trackback_author_nick']
                        tweetdata['topsy']['trackback_author_url'] = line['trackback_author_url']
                        tweetdata['topsy']['trackback_permalink'] = line['trackback_permalink']
                        tweetdata['topsy']['url'] = line['url']
                        tweetdata['topsy']['file_counter'] = file_counter
                        tweetdata['topsy']['short_file_name'] = short_file_name

                    # =======================================
                    # add geo data to Twitter's json response
                    # =======================================
                    if geocode:

                        # give everybody a blank
                        for idx in range(len(tweetdata_list)):
                            tweetdata_list[idx]["user"]["location_geoinfo"] = {}

                        # create a list of locations to send to MapQuest
                        # (note: locations are sent unescaped; urllib.quote(tweet_loc)
                        # would be safer for locations containing spaces)
                        loc_url = ''
                        for tweet_loc in tweet_loc_list:
                            loc_url = loc_url + '&location=' + tweet_loc

                        # send 'em
                        urllib.urlretrieve(mapq_url + loc_url, "batch.json")

                        # get the answer
                        batch = open("batch.json", "r")
                        lines = batch.readlines()
                        batch.close()

                        # what they send back has superfluous stuff at the front and back ends
                        match = []
                        if lines:
                            match = re.search(r"renderBatch\((\{.*\})", lines[0])
                        if match:
                            result = match.group(1)
                            try:
                                locs = json.loads(result)
                                # step through MapQuest's response and add data to Twitter's json response
                                for results in locs['results']:
                                    if results['providedLocation']['location'] in tweet_loc_dict.keys():
                                        # ===========================
                                        # an idea for an alternative
                                        # ===========================
                                        # needle = results['providedLocation']['location']
                                        # haystack = tweet_loc_dict.keys()
                                        # indices = [i for i, s in enumerate(haystack) if needle in s]
                                        dict_loc = tweet_loc_dict[results['providedLocation']['location']]
                                        tweetdata_list[dict_loc]["user"]["location_geoinfo"] = results['locations'][0]
                                        if tweetdata_list[dict_loc]["user"]["location_geoinfo"]:
                                            Geocoder_count += 1
                                    else:
                                        logging.warning("\nMAPQUEST KEY MISMATCH")
                                        logging.warning(tweet_loc_dict.keys())
                                        logging.warning(results)
                            except:
                                msg = "MapQuest sent invalid json"
                                print msg
                                logging.warning(msg)
                                logging.warning(lines)
                        else:
                            msg = "MapQuest sent empty response"
                            print msg
                            logging.warning(msg)
                            logging.warning(lines)

                    # process the json file and start over with a new batch from Twitter
                    # ==================================================================
                    process_output_file(tweetdata_list, output_filename)
                    bulk_list = []

        file_counter+=1
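
# ----------------------------------------------------------------------
# For reference: get_twitter_json() calls two helpers, process_output_file()
# and sleep_process(), that are defined elsewhere in the repo. The sketch
# below is a minimal, hypothetical reconstruction of their apparent contract
# (append each batch to the output file as one json object per line; flush,
# then wait out Twitter's 15-minute rate window). The real implementations
# may differ; for example, create_bulkfile() passes an extra first_sleep
# argument to its sleep_process().
# ----------------------------------------------------------------------
def process_output_file(tweetdata_list, output_filename):
    import json
    # append each tweet's json to the output file, one object per line
    with open(output_filename, "a") as outfile:
        for tweetdata in tweetdata_list:
            outfile.write(json.dumps(tweetdata) + "\n")

def sleep_process(tweetdata_list, output_filename):
    import time
    # write whatever we have, then sleep through Twitter's 15-minute window
    process_output_file(tweetdata_list, output_filename)
    time.sleep(15 * 60)
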
def create_bulkfile(list_of_filenames, starting_at=1, ending_at=0):
    """
    - reads in a list of fully-qualified filenames from "list_of_filenames"

      File names are expected to have the Windows Google Drive structure,
      for example ... Twitter Data\June\Cardiovasucular\Tweets_AFib.csv
      The code is commented with a simple change you can make to allow
      any fully-qualified filename, on any operating system.

    - processes each row of each file in the file list, making batched calls
      to Twitter to retrieve the data for each tweet

    - after every 13,500 rows, or whenever there is a threshold-exceeded error,
      the output_file is written and the program goes to sleep for 15 minutes

    Note: AFINN-111.txt must be in the same folder
          you can use it as is or include your own n-grams
          the 'sentiment' field is the sum of the scores of all the n-grams found
          (see the sketch after this function)

    Note: requires pygeocoder

    Input:
        list_of_filenames   a text file with fully-qualified file names
        starting_at         the line number of "list_of_filenames" where processing should start
        ending_at           if 0, process all files beginning with the "starting_at"
                            line in "list_of_filenames"
                            if > 0, process the files from line "starting_at" to
                            line "ending_at" in "list_of_filenames"

    Output:
        a csv file named "bigtweet_filexxx.csv", where xxx is the "starting_at" number

    Usage:
        %run create_bulkfile.py "filename_list.csv" 1 0

    A message like "263 skipped id 463811853097787392" indicates that Twitter
    did not return data for the tweet with id 463811853097787392 and that this
    is the 263rd such instance. Because of this and other, less common errors,
    the output file will have fewer rows than the total rows in the input files.
    """
    import csv
    import json
    import re
    import time
    import sys
    import six
    import datetime

    from twitter_functions import lookup_multiple_tweets
    from twitter_functions import parse_AFINN
    # parse_tweet_json and find_sentiment are called below; assuming they live
    # in twitter_functions as well (the original omitted these imports)
    from twitter_functions import parse_tweet_json
    from twitter_functions import find_sentiment

    # convert input parameter strings to integers
    starting_at = int(starting_at)
    ending_at = int(ending_at)

    process_start = datetime.datetime.now()
    print "\n================================"
    print "process start: %s"%process_start.strftime("%c")
    print "================================\n"

    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)

    output_filename = "bigtweet_file" + "%03d"%(starting_at,) + ".csv"

    step = 100               # we're going to process in groups of "step"
    bulk_list = []           # batch of rows from the input file
    list_of_tweet_ids = []   # tweet ids of these rows
    output_dict = []         # list of dicts to send to the output file

    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch = 13500      # we sleep after this many lines processed
    sleep_batch_rows = 0     # the number of lines we've processed since the last sleep

    number_of_files = len(filename_list)   # how many files are in the list
    file_counter = 1                       # which one this is
    first_sleep = True                     # first time through, we write an output_file header
    invalid_json = False                   # in case Twitter sends us junk
    skip_counter = 0                       # how many rows we skipped because Twitter sent no info

    # read in the n-grams for sentiment processing
    sentiment_words, sentiment_phrases = parse_AFINN("AFINN-111.txt")

    # read each file in and process it
    # ================================
    for input_filename in filename_list:

        # skip the first "starting_at - 1" files
        if file_counter < starting_at:
            print "Skipping %d of %d %s"%(file_counter, number_of_files, input_filename)
            file_counter+=1
            continue
        if ending_at != 0:
            number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention,
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename)
        short_file_name = match.group(1)

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s"%(file_counter, number_of_files, input_filename)
                break

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)       # list of all lines/rows in the input file
            totallines = len(lines)    # number of lines in the input file
            print "\n--Processing %d of %d %s rows %d"%(file_counter, number_of_files, short_file_name, totallines)

            # read the input file line by line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows+=1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename, first_sleep)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]

                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):   # successful match at the start of the string
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d"%(linenum+1)
                    print row['url']

                # if batch size is reached, process the batch
                if len(bulk_list) >= step or (linenum+1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    while True:
                        result = lookup_multiple_tweets(list_of_tweet_ids)
                        if result:
                            break
                        print "\nTwitter returned an empty result\n"
                        time.sleep(1)
                    list_of_tweet_ids = []

                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                            bulk_list = []
                            invalid_json = True
                            break
                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    #
                    # a better process might be
                    #   try:
                    #       statuses = api.GetUserTimeline(u.id)
                    #       print [s.text for s in statuses]
                    #   except TwitterError, t:
                    #       print t
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                        sleep_batch_rows = 0
                        sleep_process(output_dict, output_filename, first_sleep)
                        bulk_list = []   # we lose the batch
                        continue

                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested,
                    # so we create a dictionary for the tweetdata_list,
                    # associating ids with their positions in the list,
                    # plus a list of ids for searching
                    tweet_id_dict = {}
                    tweet_id_list = []

                    # find every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)

                    # pull each of the lines and its corresponding Twitter response
                    batch_process_count = 0
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter+=1
                            # check that the entire line['id'] is numeric
                            if re.match(r"^\d+", line['id']):   # yes
                                print "%d skipped id %d"%(skip_counter, int(line['id']))
                            else:                               # no
                                print skip_counter
                                print "line['id'] is not all numeric"
                                print line['id']
                            continue

                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter+=1
                            print "id mismatch, skipping %d"%(skip_counter)
                            print "line id  %s"%(str(line['id']))
                            print "tweet id %s"%(str(tweetdata['id']))
                            continue

                        # parse Twitter's response
                        line["file_counter"] = file_counter
                        line["short_file_name"] = short_file_name
                        line = parse_tweet_json(line, tweetdata)
                        line['sentiment'] = find_sentiment(tweetdata, sentiment_words, sentiment_phrases)
                        output_dict.append(line)
                        batch_process_count+=1

                    print "Rows processed: " + str(len(output_dict))
                    bulk_list = []

        file_counter+=1
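
# ----------------------------------------------------------------------
# For reference: create_bulkfile() scores sentiment with parse_AFINN() and
# find_sentiment() from twitter_functions.py. The sketch below is a minimal,
# hypothetical illustration of the idea described in the docstring (the
# 'sentiment' field is the sum of the scores of all the n-grams found);
# the real functions may tokenize differently and also handle phrases.
# ----------------------------------------------------------------------
def afinn_sentiment_sketch(text, afinn_path="AFINN-111.txt"):
    # AFINN-111.txt is tab-delimited, one "term<TAB>score" pair per line,
    # with integer scores from -5 to +5
    scores = {}
    for entry in open(afinn_path):
        term, score = entry.strip().split("\t")
        scores[term] = int(score)
    # sum the scores of every known unigram in the tweet text
    return sum(scores.get(word, 0) for word in text.lower().split())

# usage: line['sentiment'] = afinn_sentiment_sketch(tweetdata['text'])
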
def add_twitter_data_bulk(input_filename):
    """
    reads in a *.csv file from the Coursolve Healthcare Twitter Analysis project
    and produces an output *.csv file with a number of Twitter fields added

    The name of the output file is the name of the input file with "_full" appended

    Notes:
    1. "twitter_functions.py" must be in your folder or somewhere on your path

    2. You must provide your own file named "twitter_credentials.py"
       (see https://apps.twitter.com/) written like this:

       def twitter_credentials():
           api_key = " your credentials "
           api_secret = " your credentials "
           access_token_key = " your credentials "
           access_token_secret = " your credentials "
           return (api_key, api_secret, access_token_key, access_token_secret)

    3. Be aware that Twitter throttles your activity. This function makes
       bulk calls to Twitter to increase throughput over add_twitter_data.py,
       which makes one call to Twitter for every line.

    4. IPython usage:
       (1) from add_twitter_data_bulk import add_twitter_data_bulk
           add_twitter_data_bulk("Tweets_BleedingDisorders.csv")
       (2) %run add_twitter_data_bulk.py "Tweets_BleedingDisorders.csv"

    5. If you have problems, I'll try to help ... [email protected]
    """
    import csv
    import json
    from twitter_functions import lookup_multiple_tweets
    from twitter_functions import parse_tweet_json

    output_filename = input_filename.split(".")[0] + "_full.csv"

    step = 95                # we're going to process in groups of "step"
    bulk_list = []           # batch of rows from the input file
    list_of_tweet_ids = []   # tweet ids of these rows
    output_dict = []         # list of dicts to send to the output file

    with open(input_filename, "rb") as infile:
        reader = csv.DictReader(infile)
        lines = list(reader)       # list of all lines/rows in the input file
        totallines = len(lines)    # number of lines in the input file
        print "Rows in file: " + str(totallines)

        # read the input file line by line
        # ================================
        for linenum, row in enumerate(lines):

            # accumulate a batch of rows from the input file
            # ==============================================
            tweet_id = row['url'].split("/")[-1]
            row['id'] = tweet_id
            bulk_list.append(row)
            list_of_tweet_ids.append(tweet_id)

            # process the batch
            # =================
            if len(bulk_list) >= step or (linenum+1) >= totallines:

                # make a batch request to Twitter
                result = lookup_multiple_tweets(list_of_tweet_ids)
                list_of_tweet_ids = []
                for foo in result:
                    tweetdata_list = json.loads(foo)
                    break

                # if Twitter returns an error,
                # print the error
                # break => jump to output file processing
                if 'errors' in tweetdata_list:
                    print "\nTwitter returned an error message:"
                    print "message: " + tweetdata_list["errors"][0]['message']
                    print "code:    " + str(tweetdata_list["errors"][0]['code'])
                    print "\nIf the message is 'Rate limit exceeded', see\nhttps://dev.twitter.com/docs/rate-limiting/1.1"
                    print "In practice it means you have to wait 15 minutes"
                    import datetime
                    from datetime import timedelta
                    timenow = datetime.datetime.today().strftime("%H:%M:%S")
                    timeplus15 = (datetime.datetime.today() + timedelta(minutes=15)).strftime("%H:%M:%S")
                    print "  time now:           " + timenow
                    print "  time in 15 minutes: " + timeplus15
                    print "\nAny rows of " + input_filename + " that were processed up to this point should be in the output file\n"
                    break

                # Twitter's response is in an arbitrary order, so sort both lists by id
                # (ids are compared as strings, which works here because ids from the
                # same era have the same number of digits)
                bulk_list = sorted(bulk_list, key=lambda k: k['id'])
                tweetdata_list = sorted(tweetdata_list, key=lambda k: k['id'])
                if len(bulk_list) != len(tweetdata_list):
                    print "\nTwitter returned a different number of responses than we requested"
                    print "linenum:   " + str(linenum)
                    print "Requested: " + str(len(bulk_list))
                    print "Received:  " + str(len(tweetdata_list))

                for line, tweetdata in zip(bulk_list, tweetdata_list):
                    if str(tweetdata['id']) != str(line['id']):
                        print "\nmismatch in ids, skipping remaining rows in this batch"
                        print "tweetdata['id'] = " + str(tweetdata['id'])
                        print "line['id']      = " + str(line['id'])
                        break
                    parse_tweet_json(line, tweetdata)
                    output_dict.append(line)

                print "Rows processed: " + str(len(output_dict))
                bulk_list = []

    # create the output file
    # ======================
    if output_dict:
        f = open(output_filename, 'wb')
        w = csv.DictWriter(f, delimiter=",", fieldnames=output_dict[0].keys())
        w.writeheader()
        w.writerows(output_dict)
        f.close()
        print output_filename + " has been created"
    else:
        print output_filename + " was NOT created"
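
# ----------------------------------------------------------------------
# For reference: lookup_multiple_tweets() comes from twitter_functions.py.
# The sketch below is a minimal, hypothetical stand-in that shows the kind
# of batched call it makes: Twitter's v1.1 statuses/lookup endpoint accepts
# up to 100 comma-separated ids per request, which is why the batch sizes
# above are 95 and 100. It assumes the requests and requests_oauthlib
# packages; the helper actually used in this repo may be built differently.
# ----------------------------------------------------------------------
def lookup_multiple_tweets_sketch(list_of_tweet_ids):
    import requests
    from requests_oauthlib import OAuth1
    from twitter_credentials import twitter_credentials

    api_key, api_secret, token_key, token_secret = twitter_credentials()
    auth = OAuth1(api_key, api_secret, token_key, token_secret)
    resp = requests.get("https://api.twitter.com/1.1/statuses/lookup.json",
                        params={"id": ",".join(list_of_tweet_ids)},
                        auth=auth)
    # mimic the caller's contract: an iterable of raw json strings, the first
    # of which decodes to a list of tweet dicts (or a dict with 'errors')
    return [resp.text]
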
def get_twitter_json(list_of_filenames, starting_at=1, ending_at=0, geocode=True): """ 1) reads in a list of fully-qualified filenames from "list_of_filenames" 2) processes each row of each topsy file in the "list_of_filenames", making batched calls to Twitter to retrieve the json for each tweet adding the data from the topsy file such as "score" plus unix timestamps for the topsy firstpost_date field and Twitter's created_at field plus coordinate and place name data for Twitter's user location field - after every 13,500 rows, or whenever there is a threshold-exceeded error the program goes to sleep for 15 minutes. Note: a file named twitter_credentials.py must be in the folder with the code see the repo: it contains your Twitter credentials Note: if geocode=True a file named mapquest_key.txt must be in the folder with the code get a MapQuest key here: http://developer.mapquest.com/ Input: list_of_filenames a text file with fully-qualified file names starting_at the line number of "list_of_filenames" where processing should start ending_at if 0 process all files beginning with the "starting_at" line in "list_of_filenames" if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames" geocode if True, batched requests are made to the MapQuest Developers API for coordinate and place name data if False, these call are not made and no geo info is added Output: a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number Usage: %run get_twitter_json.py "filename_list.csv" 2 2 - or - nohup python get_twitter_json.py "filename_list.csv" 1 0 & A message like "6 skipped id 448176144668721152" means that Twitter failed to return any data about a tweet with id 448... and that this is the 6th instance of this. Fewer than 1% are skipped. 
To use the output file in python: ================================= import json tweet_file = open("bigtweet_file002.json", "r") for line in tweet_file: tweet = json.loads(str(line)) if tweet['retweet_count'] > 100: print "\n\n%d %s\n%s"%(tweet['retweet_count'], tweet['user']['name'], tweet['text']) To use the output file in R: ============================ library(rjson) file_path = ("../files/bigtweet_file002.json") tweet_list = fromJSON(sprintf("[%s]", paste(readLines(file_path),collapse=","))) for (i in 1:length(tweet_list)){ if (tweet_list[[i]]$retweet_count > 100){ cat(sprintf("\n\n%d %s\n%s",tweet_list[[i]]$retweet_count, tweet_list[[i]]$user$name, tweet_list[[i]]$text)) } } ## convert to twitteR structure library(twitteR) tweets = import_statuses(raw_data=tweet_list) To store in MongoDB using python: ================================= # create a python list of each tweet import json tweet_file = open("../files/bigtweet_file002.json", "r") tweet_list = [json.loads(str(line)) for line in tweet_file] # store the list in MongoDB from pymongo import MongoClient client = MongoClient() db = client['file002'] posts = db.posts #db.posts.remove( { } ) # delete if previously created posts.insert(tweet_list) # same example as above for result in db.posts.find({ "retweet_count": { "$gt": 100 } }): print "%d %s\n%s"%(result['retweet_count'],result['user']['name'],result['text']) """ import csv, json import re import time, datetime import sys, os import urllib2, urllib import os.path from twitter_functions import lookup_multiple_tweets # convert input parameter strings to integer starting_at = int(starting_at) ending_at = int(ending_at) geocode = bool(geocode) msg = "\nlist_of_filenames %s; starting_at %d; ending_at %d; geocode %d" % ( list_of_filenames, starting_at, ending_at, geocode) logging.info(msg) process_start = datetime.datetime.now() msg = "\n=======================================\nprocess start: %s"%process_start.strftime("%c") + \ "\n=======================================\n" print msg sys.stdout.flush() logging.info(msg) # read the list of filenames into "filename_list" # =============================================== filename_list = [] with open(list_of_filenames, "rb") as namefile: csv_reader = csv.reader(namefile) for row in csv_reader: filename_list.extend(row) output_filename = "bigtweet_file" + "%03d" % (starting_at, ) + ".json" step = 100 # we're going to process in groups of "step" bulk_list = [] # batch of rows from input file list_of_tweet_ids = [] # tweet ids of these rows output_dict = [] # list of dicts to send to output file # the Twitter rate limits are documented here # https://dev.twitter.com/docs/rate-limiting/1.1/limits sleep_batch = 13500 # we sleep after this many lines processed sleep_batch_rows = 0 # the number of lines we've processes since the last sleep # MapQuest Developer API documentation: http://developer.mapquest.com/ Geocoder_count = 0 # how many records did did we Geocode? 
if geocode: f = open('mapquest_key.txt', 'r') key = f.readline() f.close() mapq_url = 'http://www.mapquestapi.com/geocoding/v1/batch?key=' mapq_url = mapq_url + key + '&outFormat=json&maxResults=1&callback=renderBatch' logging.info("MAPQUEST URL " + mapq_url) number_of_files = len(filename_list) # how many files in the list file_counter = 1 # which one is this one global first_sleep first_sleep = True # first time through, we write an output_file header invalid_json = False # in case Twitter sends us junk global total_processed total_processed = 0 # how many rows have we processed skip_counter = 0 # how many rows did we skip because Twitter didn't send us info # read each file in and process it # ================================== for input_filename in filename_list: # skip the first "starting_at-1" files if file_counter < starting_at: msg = "Skipping %d of %d %s" % (file_counter, number_of_files, input_filename) print msg logging.info(msg) file_counter += 1 continue if ending_at != 0: number_of_files = ending_at # find the shortened file name # # note: if your filenames do not fit my convention # replace the two lines below with # # short_file_name = input_filename # match = re.search(r"Twitter Data\\(.*)", input_filename) # Windows Google Drive #match = re.search("/home/ubuntu/files(.*)", input_filename) # AWS Ubuntu short_file_name = match.group(1) # stop if we're beyond "ending_at" if ending_at > 0: if file_counter > ending_at: msg = "Ending before %d of %d %s" % ( file_counter, number_of_files, input_filename) print msg logging.info(msg) break # check that the file exists if not os.path.isfile(input_filename): msg = "%s does not exist" % input_filename print msg logging.info(msg) file_counter += 1 continue # open an input file with open(input_filename, "rb") as infile: reader = csv.DictReader(infile) lines = list(reader) # list of all lines/rows in the input file totallines = len(lines) # number of lines in the input file msg = "\n--Processing %d of %d %s rows %d" % ( file_counter, number_of_files, short_file_name, totallines) print msg logging.info(msg) sys.stdout.flush() # read the input file line-by-line # ================================ for linenum, row in enumerate(lines): # sleep if we're over the limit of lines processed sleep_batch_rows += 1 if sleep_batch_rows > sleep_batch: msg = "sleeping after %d lines of file %d of %d %s" % ( linenum, file_counter, number_of_files, short_file_name) print msg logging.info(msg) sleep_batch_rows = 0 sleep_process(output_dict, output_filename) # accumulate a batch of rows from the input file # ============================================== tweet_id = row['url'].split("/")[-1] # make sure tweet_id is actually numeric if re.match(r"^\d+", tweet_id): # Successful match at the start of the string row['id'] = tweet_id bulk_list.append(row) list_of_tweet_ids.append(tweet_id) else: msg = "tweet url terminated with non-numeric in line %d" % ( linenum + 1) print msg logging.info(msg) print row['url'] logging.info(row['url']) # if batch-size reached, process the batch if len(list_of_tweet_ids) >= step or (linenum + 1) >= totallines: # make a batch request to Twitter # =============================== result = lookup_multiple_tweets(list_of_tweet_ids) list_of_tweet_ids = [] for foo in result: try: tweetdata_list = json.loads(foo) break except ValueError, e: msg = "\nTwitter returned invalid json" print msg logging.info(msg) print e logging.info(e) msg = "after %d lines of file %d of %d %s" % ( linenum, file_counter, number_of_files, short_file_name) 
print msg logging.info(msg) bulk_list = [] invalid_json = True break if invalid_json: invalid_json = False break # if Twitter returns an error if 'errors' in tweetdata_list: msg = "Twitter returned an error message:\n" + \ "message: " + str(tweetdata_list["errors"][0]['message']) + \ "\ncode: " + str(tweetdata_list["errors"][0]['code']) + \ "\nafter %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name) print msg logging.info(msg) sleep_batch_rows = 0 sleep_process(tweetdata_list, output_filename) bulk_list = [] # we lose the batch continue # Twitter's response is in arbitrary order and doesn't necessarily # contain a response for every id we requested # # So we create a dictionary for the tweetdata_list # associating id's with their position in the list # and a list of id's for searching tweet_id_dict = {} tweet_id_list = [] tweet_loc_dict = {} tweet_loc_list = [] # save every id in tweetdata_list and its position for i in range(len(tweetdata_list)): id = str(tweetdata_list[i]['id']) tweet_id_dict[id] = i tweet_id_list.append(id) # save every location and its position if tweetdata_list[i]['user'][ 'location'] is not None and tweetdata_list[i][ 'user']['location'].strip() != "": try: loc = str( tweetdata_list[i]['user']['location']) tweet_loc_dict[loc] = i tweet_loc_list.append(loc) except: pass # pull each of the lines and its corresponding Twitter response for line in bulk_list: if line['id'] not in tweet_id_list: skip_counter += 1 # check the entire line['id'] is numeric if re.match(r"^\d+", line['id']): # yes msg = "%d skipped id %d" % (skip_counter, int(line['id'])) print msg logging.info(msg) else: # no print skip_counter logging.info(skip_counter) msg = "line['id'] is not all numeric" print msg logging.info(msg) print line['id'] logging.info(line['id']) continue tweetdata = tweetdata_list[tweet_id_dict[line['id']]] if str(line['id']) != str(tweetdata['id']): skip_counter += 1 msg = "id mismatch, skipping %d"%(skip_counter) + \ "\nline id %s"%(str(line['id'])) + \ "\ntweet id %s"%(str(tweetdata['id'])) print msg logging.info(msg) continue # =========================================== # add Topsy fields to Twitter's json response # =========================================== # add a timestamp for 'created_at' # time.ctime(tweet['timestamp']) # will decode this field tweetdata_list[tweet_id_dict[line['id']]]['timestamp'] = \ time.mktime(datetime.datetime.strptime(tweetdata_list[tweet_id_dict[line['id']]]['created_at'], '%a %b %d %H:%M:%S +0000 %Y').timetuple()) tweetdata_list[tweet_id_dict[line['id']]]['topsy'] = {} # add a timestamp for topsy's 'firstpost_date' try: tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \ time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%Y").timetuple()) except: try: tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \ time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%y").timetuple()) except: tweetdata_list[tweet_id_dict[ line['id']]]['topsy']['timestamp'] = "" # add the topsy csv file fields to the Twitter json tweetdata_list[tweet_id_dict[line['id']]]['topsy'][ 'firstpost_date'] = line['firstpost_date'] tweetdata_list[tweet_id_dict[ line['id']]]['topsy']['score'] = float( line['score']) tweetdata_list[tweet_id_dict[line['id']]]['topsy'][ 'trackback_author_nick'] = line[ 'trackback_author_nick'] tweetdata_list[tweet_id_dict[line['id']]]['topsy'][ 'trackback_author_url'] = line[ 'trackback_author_url'] 
tweetdata_list[tweet_id_dict[line['id']]]['topsy'][ 'trackback_permalink'] = line[ 'trackback_permalink'] tweetdata_list[tweet_id_dict[ line['id']]]['topsy']['url'] = line['url'] tweetdata_list[tweet_id_dict[line['id']]]['topsy'][ 'file_counter'] = file_counter tweetdata_list[tweet_id_dict[line['id']]]['topsy'][ 'short_file_name'] = short_file_name # ======================================= # add geo data to Twitter's json response # ======================================= if geocode: # give everybody a blank for idx in range(len(tweetdata_list)): tweetdata_list[idx]["user"][ "location_geoinfo"] = {} # create a list of locations to send to MapQuest loc_url = '' for tweet_loc in tweet_loc_list: loc_url = loc_url + '&location=' + tweet_loc # send 'em urllib.urlretrieve(mapq_url + loc_url, "batch.json") # get the answer batch = open("batch.json", "r") lines = batch.readlines() batch.close() # what they send back has superfluous stuff at the front and back ends match = [] if lines: match = re.search(r"renderBatch\((\{.*\})", lines[0]) if match: result = match.group(1) try: locs = json.loads(result) # step through MapQuest's response and add data to Twitter's json response for results in locs['results']: if results['providedLocation'][ 'location'] in tweet_loc_dict.keys( ): dict_loc = tweet_loc_dict[results[ 'providedLocation']['location']] tweetdata_list[dict_loc]["user"][ "location_geoinfo"] = results[ 'locations'][0] if tweetdata_list[dict_loc]["user"][ "location_geoinfo"]: Geocoder_count += 1 else: logging.warning( "\nMAPQUEST KEY MISMATCH") logging.warning(tweet_loc_dict.keys()) logging.warning(results) except: msg = "MapQuest sent invalid json" print msg logging.warning(msg) logging.warning(lines) else: msg = "MapQuest sent empty response" print msg logging.warning(msg) logging.warning(lines) # process the json file and start over with a new batch from Twitter # ================================================================== process_output_file(tweetdata_list, output_filename) bulk_list = [] file_counter += 1
def get_twitter_json(list_of_filenames, starting_at=1, ending_at=0, geocode=True): """ 1) reads in a list of fully-qualified filenames from "list_of_filenames" 2) processes each row of each topsy file in the "list_of_filenames", making batched calls to Twitter to retrieve the json for each tweet adding the data from the topsy file such as "score" plus unix timestamps for the topsy firstpost_date field and Twitter's created_at field plus coordinate and place name data for Twitter's user location field - after every 13,500 rows, or whenever there is a threshold-exceeded error the program goes to sleep for 15 minutes. On my quad i7 4G Windows 7 64-bit machine, with my Comcast Internet connection, I process about 1,700 tweets per minute. The sleep time for the Twitter threshold obviously increases the elapsed time, yielding roughly 8 + 15 = 23 minutes elapsed time per 13,500 tweets. For 2,500,000 tweets, that's 2,500,000/13,500*23 = 4259 minutes/71 hours/3 days elapsed time for all of the tweets in the project. Note: a file named twitter_credentials.py must be in the folder with the code see the repo: it contains your Twitter credentials Note: if geocode=True a file named mapquest_key.txt must be in the folder with the code get a MapQuest key here: http://developer.mapquest.com/ Note: if ["user"] is embedded in ["retweeted_status"] I do not get the location info This, plus problems like blank or incomprehensible ['location'] fields puts the geo-tagging rate at 75%. Input: list_of_filenames a text file with fully-qualified file names starting_at the line number of "list_of_filenames" where processing should start ending_at if 0 process all files beginning with the "starting_at" line in "list_of_filenames" if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames" geocode if True, batched requests are made to the MapQuest Developers API for coordinate and place name data if False, these call are not made and no geo info is added Output: a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number Usage: %run get_twitter_json.py "filename_list.csv" 2 2 A message like "6 skipped id 448176144668721152" means that Twitter failed to return any data about a tweet with id 448... and that this is the 6th instance of this. Fewer than 1% are skipped. 
To use the output file in python: ================================= import json tweet_file = open("../files/bigtweet_file002.json", "r") for line in tweet_file: tweet = json.loads(str(line)) if tweet['retweet_count'] > 100: print "\n\n%d %s\n%s"%(tweet['retweet_count'], tweet['user']['name'], tweet['text']) To use the output file in R: ============================ library(rjson) file_path = ("../files/bigtweet_file002.json") tweet_list = fromJSON(sprintf("[%s]", paste(readLines(file_path),collapse=","))) for (i in 1:length(tweet_list)){ if (tweet_list[[i]]$retweet_count > 100){ cat(sprintf("\n\n%d %s\n%s",tweet_list[[i]]$retweet_count, tweet_list[[i]]$user$name, tweet_list[[i]]$text)) } } ## convert to twitteR structure library(twitteR) tweets = import_statuses(raw_data=tweet_list) To store in MongoDB using python: ================================= # create a python list of each tweet import json tweet_file = open("../files/bigtweet_file002.json", "r") tweet_list = [json.loads(str(line)) for line in tweet_file] # store the list in MongoDB from pymongo import MongoClient client = MongoClient() db = client['file002'] posts = db.posts #db.posts.remove( { } ) # delete if previously created posts.insert(tweet_list) # same example as above for result in db.posts.find({ "retweet_count": { "$gt": 100 } }): print "%d %s\n%s"%(result['retweet_count'],result['user']['name'],result['text']) """ import csv, json import re import time, datetime import sys, os import urllib2,urllib import os.path from twitter_functions import lookup_multiple_tweets # convert input parameter strings to integer starting_at = int(starting_at) ending_at = int(ending_at) process_start = datetime.datetime.now() print "\n================================" print "process start: %s"%process_start.strftime("%c") print "================================\n" sys.stdout.flush() # read the list of filenames into "filename_list" # =============================================== filename_list = [] with open(list_of_filenames, "rb") as namefile: csv_reader = csv.reader(namefile) for row in csv_reader: filename_list.extend(row) output_filename = "bigtweet_file" + "%03d"%(starting_at,) + ".json" step = 100 # we're going to process in groups of "step" bulk_list = [] # batch of rows from input file list_of_tweet_ids = [] # tweet ids of these rows output_dict = [] # list of dicts to send to output file # the Twitter rate limits are documented here # https://dev.twitter.com/docs/rate-limiting/1.1/limits sleep_batch = 13500 # we sleep after this many lines processed sleep_batch_rows = 0 # the number of lines we've processes since the last sleep # MapQuest Developer API documentation: http://developer.mapquest.com/ Geocoder_count = 0 # how many records did did we Geocode? 
if geocode: f = open('mapquest_key.txt','r') key = f.readline() f.close() mapq_url = 'http://www.mapquestapi.com/geocoding/v1/batch?key=' mapq_url = mapq_url + key + '&outFormat=json&maxResults=1&callback=renderBatch' number_of_files = len(filename_list) # how many files in the list file_counter = 1 # which one is this one global first_sleep first_sleep = True # first time through, we write an output_file header invalid_json = False # in case Twitter sends us junk global total_processed total_processed = 0 # how many rows have we processed skip_counter = 0 # how many rows did we skip because Twitter didn't send us info # read each file in and process it # ================================== for input_filename in filename_list: # skip the first "starting_at-1" files if file_counter < starting_at: print "Skipping %d of %d %s"%(file_counter, number_of_files, input_filename) file_counter+=1 continue if ending_at != 0: number_of_files = ending_at # find the shortened file name # # note: if your filenames do not fit my convention # replace the two lines below with # # short_file_name = input_filename # match = re.search(r"Twitter Data\\(.*)", input_filename) short_file_name = match.group(1) # stop if we're beyond "ending_at" if ending_at > 0: if file_counter > ending_at: print "Ending before %d of %d %s"%(file_counter, number_of_files, input_filename) break # check that the file exists if not os.path.isfile(input_filename): print "%s does not exist"%input_filename file_counter+=1 continue # open an input file with open(input_filename, "rb" ) as infile: reader = csv.DictReader(infile) lines = list(reader) # list of all lines/rows in the input file totallines = len(lines) # number of lines in the input file print "\n--Processing %d of %d %s rows %d"%(file_counter, number_of_files, short_file_name,totallines) sys.stdout.flush() # read the input file line-by-line # ================================ for linenum, row in enumerate(lines): # sleep if we're over the limit of lines processed sleep_batch_rows+=1 if sleep_batch_rows > sleep_batch: print "sleeping after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name) sleep_batch_rows = 0 sleep_process(output_dict, output_filename) # accumulate a batch of rows from the input file # ============================================== tweet_id = row['url'].split("/")[-1] # make sure tweet_id is actually numeric if re.match(r"^\d+", tweet_id): # Successful match at the start of the string row['id'] = tweet_id bulk_list.append(row) list_of_tweet_ids.append(tweet_id) else: print "tweet url terminated with non-numeric in line %d"%(linenum+1) print row['url'] # if batch-size reached, process the batch if len(list_of_tweet_ids) >= step or (linenum+1) >= totallines: # make a batch request to Twitter # =============================== result = lookup_multiple_tweets(list_of_tweet_ids) list_of_tweet_ids = [] for foo in result: try: tweetdata_list = json.loads(foo) break except ValueError, e: print "\nTwitter returned invalid json" print e print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name) bulk_list = [] invalid_json = True break if invalid_json: invalid_json = False break # if Twitter returns an error if 'errors' in tweetdata_list: print "Twitter returned an error message:" print "message: " + str(tweetdata_list["errors"][0]['message']) print "code: " + str(tweetdata_list["errors"][0]['code']) print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, 
short_file_name) sleep_batch_rows = 0 sleep_process(tweetdata_list, output_filename) bulk_list = [] # we lose the batch continue # Twitter's response is in arbitrary order and doesn't necessarily # contain a response for every id we requested # # So we create a dictionary for the tweetdata_list # associating id's with their position in the list # and a list of id's for searching tweet_id_dict = {} tweet_id_list = [] tweet_loc_dict = {} tweet_loc_list = [] # save every id in tweetdata_list and its position for i in range(len(tweetdata_list)): id = str(tweetdata_list[i]['id']) tweet_id_dict[id] = i tweet_id_list.append(id) # save every location and its position if tweetdata_list[i]['user']['location'] is not None and tweetdata_list[i]['user']['location'].strip() != "": try: loc = str(tweetdata_list[i]['user']['location']) tweet_loc_dict[loc] = i tweet_loc_list.append(loc) except: pass # pull each of the lines and its corresponding Twitter response for line in bulk_list: if line['id'] not in tweet_id_list: skip_counter+=1 # check the entire line['id'] is numeric if re.match(r"^\d+", line['id']): # yes print "%d skipped id %d"%(skip_counter, int(line['id'])) else: # no print skip_counter print "line['id'] is not all numeric" print line['id'] continue tweetdata = tweetdata_list[tweet_id_dict[line['id']]] if str(line['id']) != str(tweetdata['id']): skip_counter+=1 print "id mismatch, skipping %d"%(skip_counter) print "line id %s"%(str(line['id'])) print "tweet id %s"%(str(tweetdata['id'])) continue # =========================================== # add Topsy fields to Twitter's json response # =========================================== # add a timestamp for 'created_at' # time.ctime(tweet['timestamp']) # will decode this field tweetdata_list[tweet_id_dict[line['id']]]['timestamp'] = \ time.mktime(datetime.datetime.strptime(tweetdata_list[tweet_id_dict[line['id']]]['created_at'], '%a %b %d %H:%M:%S +0000 %Y').timetuple()) tweetdata_list[tweet_id_dict[line['id']]]['topsy'] = {} # add a timestamp for topsy's 'firstpost_date' try: tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \ time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%Y").timetuple()) except: try: tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \ time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%y").timetuple()) except: tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = "" # add the topsy csv file fields to the Twitter json tweetdata_list[tweet_id_dict[line['id']]]['topsy']['firstpost_date'] = line['firstpost_date'] tweetdata_list[tweet_id_dict[line['id']]]['topsy']['score'] = float(line['score']) tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_author_nick'] = line['trackback_author_nick'] tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_author_url'] = line['trackback_author_url'] tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_permalink'] = line['trackback_permalink'] tweetdata_list[tweet_id_dict[line['id']]]['topsy']['url'] = line['url'] tweetdata_list[tweet_id_dict[line['id']]]['topsy']['file_counter'] = file_counter tweetdata_list[tweet_id_dict[line['id']]]['topsy']['short_file_name'] = short_file_name # ======================================= # add geo data to Twitter's json response # ======================================= if geocode: # give everybody a blank for idx in range(len(tweetdata_list)): tweetdata_list[idx]["user"]["location_geoinfo"] = {} # create a list of locations to send to MapQuest 
loc_url = '' for tweet_loc in tweet_loc_list: loc_url = loc_url + '&location=' + tweet_loc # send 'em urllib.urlretrieve (mapq_url + loc_url, "batch.json") # get the answer batch = open("batch.json","r") lines = batch.readlines() batch.close() # what they send back has superfluous stuff at the front and back ends try: locs = json.loads(lines[0][12:-1]) # step through MapQuest's response and add data to Twitter's json response for results in locs['results']: if results['providedLocation']['location'] in tweet_loc_dict.keys(): dict_loc = tweet_loc_dict[results['providedLocation']['location']] tweetdata_list[dict_loc]["user"]["location_geoinfo"] = results['locations'][0] Geocoder_count += 1 except: print "MapQuest sent invalid json" # process the json file and start over with a new batch from Twitter # ================================================================== process_output_file(tweetdata_list, output_filename) bulk_list = [] file_counter+=1
def add_twitter_data_bulk(input_filename): """ reads in a *.csv file from the Coursolve Healthcare Twitter Analysis project and produces an output *.csv file with a number of Twitter fields added The name of the output file is the name of the input file with "_full" appended Notes: 1. "twitter_functions.py" must be in your folder or somewhere on your path 2. You must provide your own file named "twitter_credentials.py" (see https://apps.twitter.com/) written like this: def twitter_credentials(): api_key = " your credentials " api_secret = " your credentials " access_token_key = " your credentials " access_token_secret = " your credentials " return (api_key,api_secret,access_token_key,access_token_secret) 3. You need to be aware that Twitter throttles your activity. This function makes bulk calls to Twitter to try to increase our throughput over add_twitter_data.py which makes one call to Twitter for every line 4. IPython usage: (1) from add_twitter_data_bulk import add_twitter_data_bulk add_twitter_data_bulk("Tweets_BleedingDisorders.csv") (2) %run add_twitter_data_bulk.py "Tweets_BleedingDisorders.csv" 5. If you have problems, I'll try to help ... [email protected] """ import csv import json from twitter_functions import lookup_multiple_tweets from twitter_functions import parse_tweet_json output_filename = input_filename.split(".")[0] + "_full.csv" step = 95 # we're going to process in groups of "step" bulk_list = [] # batch of rows from input file list_of_tweet_ids = [] # tweet ids of these rows output_dict = [] # list of dicts to send to output file with open(input_filename, "rb") as infile: reader = csv.DictReader(infile) lines = list(reader) # list of all lines/rows in the input file totallines = len(lines) # number of lines in the input file print "Rows in file: " + str(totallines) # read the input file line-by-line # ================================ for linenum, row in enumerate(lines): # accumulate a batch of rows from the input file # ============================================== tweet_id = row['url'].split("/")[-1] row['id'] = tweet_id bulk_list.append(row) list_of_tweet_ids.append(tweet_id) # process the batch # ================= if len(bulk_list) >= step or (linenum + 1) >= totallines: # make a batch request to Twitter result = lookup_multiple_tweets(list_of_tweet_ids) list_of_tweet_ids = [] for foo in result: tweetdata_list = json.loads(foo) break # if twitter returns an error # print the error # break => jump to output file processing if 'errors' in tweetdata_list: print "\nTwitter returned an error message:" print "message: " + tweetdata_list["errors"][0]['message'] print "code: " + str( tweetdata_list["errors"][0]['code']) print "\nIf the message is 'Rate limit exceeded', see\nhttps://dev.twitter.com/docs/rate-limiting/1.1" print "It basically seems to mean you have to wait 15 minutes" import datetime from datetime import timedelta timenow = datetime.datetime.today().strftime("%H:%M:%S") timeplus15 = (datetime.datetime.today() + timedelta(minutes=15)).strftime("%H:%M:%S") print " time now: " + timenow + "\n time in 15 minutes: " + timeplus15 print "\nAny rows of " + input_filename + " that were processed up to this point should be in the output file\n" break # Twitter's response is in an arbitrary order so sort both lists by id bulk_list = sorted(bulk_list, key=lambda k: k['id']) tweetdata_list = sorted(tweetdata_list, key=lambda k: k['id']) if len(bulk_list) != len(tweetdata_list): print "\nTwitter returned a different number of responses than we requested" print 
"linenum: " + str(linenum) print "Requested: " + str(len(bulk_list)) print "Received: " + str(len(tweetdata_list)) for line, tweetdata in zip(bulk_list, tweetdata_list): if str(tweetdata['id']) != str(line['id']): print "\nmismatch in ids, skipping remaining rows in this batch" print "tweetdata['id']=" + str(tweetdata['id']) print "line['id']= " + str(line['id']) break parse_tweet_json(line, tweetdata) output_dict.append(line) print "Rows processed: " + str(len(output_dict)) bulk_list = [] # create the output file # ====================== if output_dict: f = open(output_filename, 'wb') w = csv.DictWriter(f, delimiter=",", fieldnames=output_dict[0].keys()) w.writeheader() w.writerows(output_dict) f.close() print output_filename + " has been created" else: print output_filename + " was NOT created"
def create_bulkfile(list_of_filenames, starting_at=1, ending_at=0): """ - reads in a list of fully-qualified filenames from "list_of_filenames" I'm expecting file names to have the Windows Google Drive structure, for example ... Twitter Data\June\Cardiovasucular\Tweets_AFib.csv the code is commented with a simple solution you can implement to allow you to have any arbitrary fully-qualified filename, for any operating system - processes each row of each file in the file list, making batched calls to Twitter to retrieve the data for each tweet - after every 13,500 rows, or whenever there is a threshold-exceeded error the output_file is written and the program goes to sleep for 15 minutes. Note: AFINN-111.txt must be in the same folder you can use it as is or include your own n-grams the 'sentiment' field is the sum of the scores of all the n-grams found Input: list_of_filenames a text file with fully-qualified file names starting_at the line number of "list_of_filenames" where processing should start ending_at if 0 process all files beginning with the "starting_at" line in "list_of_filenames" if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames" Output: a csv file named "bigtweet_filexxx.csv", where xxx is the "starting_at" number Usage: %run create_bulkfile.py "filename_list.csv" 1 0 A message like "263 skipped id 463811853097787392" indicates that Twitter did not return data for a tweet with the id of 463811853097787392 and this is the 263rd instance of this. As a result of this and other less-common errors the output file will have fewer rows than the total rows in the input files. """ import csv import json import re import time import sys import six import datetime from twitter_functions import lookup_multiple_tweets from twitter_functions import parse_AFINN # convert input parameter strings to integer starting_at = int(starting_at) ending_at = int(ending_at) process_start = datetime.datetime.now() print "\n================================" print "process start: %s" % process_start.strftime("%c") print "================================\n" # read the list of filenames into "filename_list" # =============================================== filename_list = [] with open(list_of_filenames, "rb") as namefile: csv_reader = csv.reader(namefile) for row in csv_reader: filename_list.extend(row) output_filename = "bigtweet_file" + "%03d" % (starting_at, ) + ".csv" step = 100 # we're going to process in groups of "step" bulk_list = [] # batch of rows from input file list_of_tweet_ids = [] # tweet ids of these rows output_dict = [] # list of dicts to send to output file # the Twitter rate limits are documented here # https://dev.twitter.com/docs/rate-limiting/1.1/limits sleep_batch = 13500 # we sleep after this many lines processed sleep_batch_rows = 0 # the number of lines we've processes since the last sleep number_of_files = len(filename_list) # how many files in the list file_counter = 1 # which one is this one first_sleep = True # first time through, we write an output_file header invalid_json = False # in case Twitter sends us junk skip_counter = 0 # how many rows did we skip because Twitter didn't send us info # read in the n-grams for sentiment processing sentiment_words, sentiment_phrases = parse_AFINN("AFINN-111.txt") # read each file in and process it # ================================== for input_filename in filename_list: # skip the first "starting_at-1" files if file_counter < starting_at: print "Skipping %d of %d %s" % (file_counter, 
        if ending_at != 0:
            number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename)
        short_file_name = match.group(1)

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s" % (file_counter, number_of_files, input_filename)
                break

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)        # list of all lines/rows in the input file
            totallines = len(lines)     # number of lines in the input file
            print "\n--Processing %d of %d %s rows %d" % (file_counter, number_of_files, short_file_name, totallines)

            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows += 1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s" % (linenum, file_counter, number_of_files, short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename, first_sleep)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]

                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):
                    # successful match at the start of the string
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d" % (linenum + 1)
                    print row['url']

                # if batch-size reached, process the batch
                if len(bulk_list) >= step or (linenum + 1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    while True:
                        result = lookup_multiple_tweets(list_of_tweet_ids)
                        if result:
                            break
                        print "\nTwitter returned an empty result\n"
                        time.sleep(1)
                    list_of_tweet_ids = []

                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s" % (linenum, file_counter, number_of_files, short_file_name)
                            bulk_list = []
                            invalid_json = True
                            break
                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    #
                    # a better process would be:
                    #   try:
                    #       statuses = api.GetUserTimeline(u.id)
                    #       print [s.text for s in statuses]
                    #   except TwitterError, t:
                    #       print t
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s" % (linenum, file_counter, number_of_files, short_file_name)
                        sleep_batch_rows = 0
                        sleep_process(output_dict, output_filename, first_sleep)
                        bulk_list = []  # we lose the batch
                        continue

                    # Twitter's response is in arbitrary order and doesn't
                    # necessarily contain a response for every id we
                    # requested, so build a dictionary mapping each returned
                    # id to its position in tweetdata_list
                    tweet_id_dict = {}
                    for i, tweet in enumerate(tweetdata_list):
                        tweet_id_dict[str(tweet['id'])] = i

                    # pull each of the lines and its corresponding Twitter response
                    batch_process_count = 0
                    for line in bulk_list:

                        if line['id'] not in tweet_id_dict:
                            skip_counter += 1
                            # check that the entire line['id'] is numeric
                            if re.match(r"^\d+$", line['id']):
                                # yes
                                print "%d skipped id %d" % (skip_counter, int(line['id']))
                            else:
                                # no
                                print skip_counter
                                print "line['id'] is not all numeric"
                                print line['id']
                            continue

                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]

                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter += 1
                            print "id mismatch, skipping %d" % (skip_counter)
                            print "line id  %s" % (str(line['id']))
                            print "tweet id %s" % (str(tweetdata['id']))
                            continue

                        # parse Twitter's response
                        line["file_counter"] = file_counter
                        line["short_file_name"] = short_file_name
                        line = parse_tweet_json(line, tweetdata)
                        line['sentiment'] = find_sentiment(tweetdata, sentiment_words, sentiment_phrases)

                        output_dict.append(line)
                        batch_process_count += 1

                    print "Rows processed: " + str(len(output_dict))
                    bulk_list = []

        file_counter += 1
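# ----------------------------------------------------------------------
# find_sentiment() is called above but not defined in this listing.
# Below is a minimal, hypothetical sketch consistent with the docstring
# ("the 'sentiment' field is the sum of the scores of all the n-grams
# found"). It assumes parse_AFINN() returns two dicts mapping single
# words and multi-word phrases to their AFINN-111 scores; the author's
# version may tokenize differently.

def find_sentiment(tweetdata, sentiment_words, sentiment_phrases):
    text = (tweetdata.get('text') or '').lower()
    score = 0
    # score multi-word AFINN entries first, as simple substring matches
    for phrase, value in sentiment_phrases.items():
        if phrase in text:
            score += value
    # then score each single word
    for word in text.split():
        score += sentiment_words.get(word, 0)
    return score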
def create_jsonfile(list_of_filenames, starting_at=1, ending_at=0):
    """
    - reads in a list of fully-qualified filenames from "list_of_filenames"

    - processes each row of each file in the file list, making batched
      calls to Twitter to retrieve the data for each tweet

    - after every 13,500 rows, or whenever there is a threshold-exceeded
      error, the output_file is written and the program goes to sleep
      for 15 minutes.

    Input: list_of_filenames a text file with fully-qualified file names
           starting_at       the line number of "list_of_filenames" where
                             processing should start
           ending_at         if 0, process all files beginning with the
                             "starting_at" line in "list_of_filenames"
                             if > 0, process the files from line
                             "starting_at" to line "ending_at" in
                             "list_of_filenames"

    Output: a text file named "bigtweet_filexxx.json",
            where xxx is the "starting_at" number

    Usage: %run create_jsonfile.py "filename_list.csv" 1 0

    To use the output file in python:
    =================================
    import json
    tweet_file = open("../files/bigtweet_file003.json", "r")
    for line in tweet_file:
        tweet = json.loads(str(line))
        if tweet['retweet_count'] > 100:
            print "\n\n%d %s\n%s"%(tweet['retweet_count'],
                                   tweet['user']['name'],
                                   tweet['text'])

    To use the output file in R:
    ============================
    library(rjson)
    file_path = "../files/bigtweet_file003.json"
    tweet_list = fromJSON(sprintf("[%s]",
                          paste(readLines(file_path), collapse=",")))
    for (i in 1:length(tweet_list)){
        if (tweet_list[[i]]$retweet_count > 100){
            cat(sprintf("\n\n%d %s\n%s", tweet_list[[i]]$retweet_count,
                        tweet_list[[i]]$user$name,
                        tweet_list[[i]]$text))
        }
    }

    ## convert to twitteR structure
    library(twitteR)
    tweets = import_statuses(raw_data=tweet_list)

    To store in MongoDB using python:
    =================================
    # create a python list of each tweet
    import json
    tweet_file = open("../files/bigtweet_file003.json", "r")
    tweet_list = [json.loads(str(line)) for line in tweet_file]

    # store the list in MongoDB
    from pymongo import MongoClient
    client = MongoClient()
    db = client['file003']
    posts = db.posts
    #db.posts.remove( { } )   # delete if previously created
    posts.insert(tweet_list)

    # same example as above
    for result in db.posts.find({ "retweet_count": { "$gt": 100 } }):
        print "%d %s\n%s"%(result['retweet_count'],
                           result['user']['name'],
                           result['text'])
    """
    import csv
    import json
    import re
    import time
    import sys
    import datetime

    from twitter_functions import lookup_multiple_tweets

    # convert input parameter strings to integer
    starting_at = int(starting_at)
    ending_at = int(ending_at)

    process_start = datetime.datetime.now()
    print "\n================================"
    print "process start: %s" % process_start.strftime("%c")
    print "================================\n"

    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)

    output_filename = "bigtweet_file" + "%03d" % (starting_at,) + ".json"

    step = 100              # we're going to process in groups of "step"
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict = []        # list of dicts to send to output file

    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch = 13500     # we sleep after this many lines processed
    sleep_batch_rows = 0    # the number of lines we've processed since the last sleep

    number_of_files = len(filename_list)  # how many files in the list
    file_counter = 1                      # which one is this one

    global first_sleep
    first_sleep = True      # first time through, we write an output_file header
    invalid_json = False    # in case Twitter sends us junk
    global total_processed
    total_processed = 0     # how many rows have we processed

    # read each file in and process it
    # ==================================
    for input_filename in filename_list:

        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            print "Skipping %d of %d %s" % (file_counter, number_of_files, input_filename)
            file_counter += 1
            continue

        if ending_at != 0:
            number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename)
        short_file_name = match.group(1)

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s" % (file_counter, number_of_files, input_filename)
                break

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)        # list of all lines/rows in the input file
            totallines = len(lines)     # number of lines in the input file
            print "\n--Processing %d of %d %s rows %d" % (file_counter, number_of_files, short_file_name, totallines)

            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows += 1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s" % (linenum, file_counter, number_of_files, short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]

                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):
                    # successful match at the start of the string
                    row['id'] = tweet_id
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d" % (linenum + 1)
                    print row['url']

                # if batch-size reached, process the batch
                if len(list_of_tweet_ids) >= step or (linenum + 1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    result = lookup_multiple_tweets(list_of_tweet_ids)
                    list_of_tweet_ids = []

                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s" % (linenum, file_counter, number_of_files, short_file_name)
                            invalid_json = True
                            break
                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    #
                    # a better process would be:
                    #   try:
                    #       statuses = api.GetUserTimeline(u.id)
                    #       print [s.text for s in statuses]
                    #   except TwitterError, t:
                    #       print t
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s" % (linenum, file_counter, number_of_files, short_file_name)
                        sleep_batch_rows = 0
                        # sleep with the accumulated output rows, as in the
                        # rate-limit branch above, rather than with Twitter's
                        # error response
                        sleep_process(output_dict, output_filename)
                        continue

                    process_output_file(tweetdata_list, output_filename)

        file_counter += 1
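# ----------------------------------------------------------------------
# process_output_file() is called above but not defined in this listing.
# A minimal, hypothetical sketch: it assumes the helper appends one JSON
# object per line (which is how the docstring's read-back examples
# consume the file) and that first_sleep distinguishes the first write,
# which truncates the file, from later appends. The author's actual
# helper may differ.

def process_output_file(tweetdata_list, output_filename):
    import json
    global first_sleep, total_processed
    mode = "w" if first_sleep else "a"   # start fresh on the first write only
    first_sleep = False
    with open(output_filename, mode) as f:
        for tweet in tweetdata_list:
            f.write(json.dumps(tweet) + "\n")
            total_processed += 1
    print "Rows processed: %d" % total_processed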