def get_twitter_json(list_of_filenames, starting_at=1, ending_at=0, geocode=True):
    """
    1) reads in a list of fully-qualified filenames from "list_of_filenames"
        
    2) processes each row of each topsy file in the "list_of_filenames", 
       making batched calls to Twitter to retrieve the json for each tweet
           adding the data from the topsy file such as "score"
           plus unix timestamps for the topsy firstpost_date field and Twitter's created_at field
           plus coordinate and place name data for Twitter's user location field
    
    - after every 13,500 rows, or whenever there is a threshold-exceeded error
      the program goes to sleep for 15 minutes.
      
    Note: a file named twitter_credentials.py must be in the folder with the code;
          it contains your Twitter credentials (see the repo)
    
    Note: if geocode=True a file named mapquest_key.txt must be in the folder with the code
          get a MapQuest key here: http://developer.mapquest.com/
   
      
    Input: list_of_filenames   a text file with fully-qualified file names
           starting_at         the line number of "list_of_filenames" where processing should start
           ending_at           if 0   process all files beginning with the "starting_at" line in "list_of_filenames"
                               if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames"
           geocode             if True, batched requests are made to the MapQuest Developers API for coordinate and place name data
                               if False, these calls are not made and no geo info is added
                                 
           
    Output: a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number
        
    Usage: %run get_twitter_json.py "filename_list.csv" 2 2
                          - or -
           nohup python get_twitter_json.py "filename_list.csv" 1 0 &
    
    A message like "6 skipped id 448176144668721152" means that Twitter failed to return any data about 
    a tweet with id 448... and that this is the 6th instance of this. 
    """
    
    import csv, json
    import re
    import time, datetime
    import sys, os
    import urllib2,urllib
    import os.path
    import logging  # logging.info/.warning are used throughout this function
    from twitter_functions import lookup_multiple_tweets
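    # Assumptions about the helpers (based on how they are used below):
    #   lookup_multiple_tweets(ids) returns an iterable whose elements are json strings
    #   sleep_process() and process_output_file() are defined elsewhere in this module
    #   and handle the 15-minute sleep and the writing of the output json, respectively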
    
    # convert input parameters (they arrive as strings when invoked from the command line)
    starting_at = int(starting_at)
    ending_at   = int(ending_at)
    # bool("False") would be True, so treat only explicit "false"/"0" strings as False
    geocode     = str(geocode).lower() not in ("false", "0")
    msg = "\nlist_of_filenames %s; starting_at %d; ending_at %d; geocode %d"%(list_of_filenames,starting_at,ending_at,geocode)
    logging.info(msg)
    
    process_start = datetime.datetime.now()
    msg = "\n=======================================\nprocess start: %s"%process_start.strftime("%c") + \
          "\n=======================================\n"
    print msg
    sys.stdout.flush()
    logging.info(msg)
    
    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)
    
    output_filename   = "bigtweet_file" + "%03d"%(starting_at,) + ".json"
    step              = 100 # we're going to process in groups of "step"
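    # note: Twitter's bulk lookup endpoint (statuses/lookup) accepts at most 100 ids
    #       per request, which is why the batch size is 100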
    bulk_list         = []  # batch of rows from input file
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict       = []  # list of dicts to send to output file
    
    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch       = 13500 # we sleep after this many lines processed
    sleep_batch_rows  = 0     # the number of lines we've processed since the last sleep
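    # 13,500 rows is 135 batched lookups of 100 ids, which should keep each 15-minute
    # window under the documented request limit for the lookup endpoint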
    
    # MapQuest Developer API documentation: http://developer.mapquest.com/
    Geocoder_count    = 0     # how many records did we geocode?
    if geocode:
        f = open('mapquest_key.txt','r')
        key = f.readline()
        f.close()
        mapq_url = 'http://www.mapquestapi.com/geocoding/v1/batch?key='
        mapq_url = mapq_url + key + '&outFormat=json&maxResults=1&callback=renderBatch'
        logging.info("MAPQUEST URL " + mapq_url)

    
    number_of_files   = len(filename_list) # how many files in the list
    file_counter      = 1                  # which one is this one
    global first_sleep
    first_sleep       = True               # first time through, we write an output_file header
    invalid_json      = False              # in case Twitter sends us junk
    global total_processed
    total_processed   = 0                  # how many rows have we processed
    skip_counter      = 0                  # how many rows did we skip because Twitter didn't send us info
    
    # read each file in and process it
    # ==================================
    for input_filename in filename_list:
        
        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            msg = "Skipping %d of %d %s"%(file_counter, number_of_files, input_filename)
            print msg
            logging.info(msg)
            file_counter+=1
            continue  
            
        if ending_at != 0: number_of_files = ending_at
            
        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        #match = re.search(r"Twitter Data\\(.*)", input_filename) # Windows Google Drive
        match = re.search("/home/ubuntu/files(.*)", input_filename) # AWS Ubuntu
        short_file_name = match.group(1) if match else input_filename

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                msg = "Ending before %d of %d %s"%(file_counter, number_of_files, input_filename)
                print msg
                logging.info(msg)
                break
                
        # check that the file exists
        if not os.path.isfile(input_filename):
            msg = "%s does not exist"%input_filename
            print msg
            logging.info(msg)
            file_counter+=1
            continue
        
        # open an input file
        with open(input_filename, "rb" ) as infile:
            reader     = csv.DictReader(infile)
            lines      = list(reader) # list of all lines/rows in the input file
            totallines = len(lines)   # number of lines in the input file
            
            msg = "\n--Processing %d of %d %s rows %d"%(file_counter, number_of_files, short_file_name,totallines)
            print msg
            logging.info(msg)
            sys.stdout.flush()
            
            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):
                
                # sleep if we're over the limit of lines processed
                sleep_batch_rows+=1
                if sleep_batch_rows > sleep_batch:
                    msg = "sleeping after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                    print msg
                    logging.info(msg)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename)
                    
                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id  = row['url'].split("/")[-1]
                # make sure tweet_id is entirely numeric
                if re.match(r"^\d+$", tweet_id):
                    # the whole id is numeric
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    msg = "tweet url terminated with non-numeric in line %d"%(linenum+1)
                    print msg
                    logging.info(msg)
                    print row['url']
                    logging.info(row['url'])
                
                # if batch-size reached, process the batch
                if len(list_of_tweet_ids) >= step or (linenum+1) >= totallines:
                   
                    # make a batch request to Twitter 
                    # ===============================
                    result = lookup_multiple_tweets(list_of_tweet_ids)
                        
                    list_of_tweet_ids = []
                    
                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            msg = "\nTwitter returned invalid json"
                            print msg
                            logging.info(msg)
                            print e
                            logging.info(e)
                            msg = "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                            print msg
                            logging.info(msg)
                            bulk_list = []
                            invalid_json = True
                            break
                            
                    if invalid_json:
                        invalid_json = False
                        break
                        
                    # if Twitter returns an error
                    if 'errors' in tweetdata_list:
                        msg = "Twitter returned an error message:\n" + \
                              "message: " + str(tweetdata_list["errors"][0]['message']) + \
                              "\ncode:    " + str(tweetdata_list["errors"][0]['code']) + \
                              "\nafter %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                        print msg
                        logging.info(msg)
                        sleep_batch_rows = 0
                        sleep_process(tweetdata_list, output_filename)
                        bulk_list = [] # we lose the batch
                        continue
                    
                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested
                    #
                    # So we create a dictionary for the tweetdata_list
                    # associating id's with their position in the list
                    # and a list of id's for searching
                    
                    tweet_id_dict = {}
                    tweet_id_list = []
                    
                    tweet_loc_dict = {}
                    tweet_loc_list = []
                    
                    # save every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)
                        
                        # save every location and its position
                        if tweetdata_list[i]['user']['location'] is not None and tweetdata_list[i]['user']['location'].strip() != "":
                            try:
                                loc = str(tweetdata_list[i]['user']['location'])
                                tweet_loc_dict[loc] = i
                                tweet_loc_list.append(loc)
                            except:
                                pass
                        
                    # pull each of the lines and its corresponding Twitter response
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter+=1
                            # check that the entire line['id'] is numeric
                            if re.match(r"^\d+$", line['id']):
                                # yes
                                msg = "%d skipped id %d"%(skip_counter, int(line['id']))
                                print msg
                                logging.info(msg)
                            else:
                                # no
                                print skip_counter
                                logging.info(skip_counter)
                                msg = "line['id'] is not all numeric"
                                print msg
                                logging.info(msg)
                                print line['id']            
                                logging.info(line['id'])
                            continue
                            
                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter+=1
                            msg = "id mismatch, skipping %d"%(skip_counter) + \
                                   "\nline  id %s"%(str(line['id'])) + \
                                   "\ntweet id %s"%(str(tweetdata['id']))
                            print msg
                            logging.info(msg)
                            continue
                            
                        # ===========================================
                        # add Topsy fields to Twitter's json response
                        # ===========================================
                            
                        # add a timestamp for 'created_at'
                        # time.ctime(tweet['timestamp']) # will decode this field
                        tweetdata_list[tweet_id_dict[line['id']]]['timestamp'] = \
                            time.mktime(datetime.datetime.strptime(tweetdata_list[tweet_id_dict[line['id']]]['created_at'], '%a %b %d %H:%M:%S +0000 %Y').timetuple())
                            
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'] = {}
                        # add a timestamp for topsy's 'firstpost_date' 
                        try: 
                            tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \
                                time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%Y").timetuple())
                        except:
                            try:
                                tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \
                                    time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%y").timetuple())
                            except:
                                tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = ""
                                
                        # add the topsy csv file fields to the Twitter json
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['firstpost_date']        = line['firstpost_date']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['score']                 = float(line['score'])
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_author_nick'] = line['trackback_author_nick']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_author_url']  = line['trackback_author_url']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_permalink']   = line['trackback_permalink']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['url']                   = line['url']
                        
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['file_counter']          = file_counter
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['short_file_name']       = short_file_name
                        
                    # =======================================
                    # add geo data to Twitter's json response
                    # =======================================
                    
                    if geocode:
                        # give everybody a blank 
                        for idx in range(len(tweetdata_list)):
                            tweetdata_list[idx]["user"]["location_geoinfo"] = {}
                            
                        # create a list of locations to send to MapQuest
                        loc_url = ''
                        for tweet_loc in tweet_loc_list:
                            # quote the location so spaces/special characters don't break the request
                            loc_url = loc_url + '&location=' + urllib.quote(tweet_loc)
                        # send 'em
                        urllib.urlretrieve (mapq_url + loc_url, "batch.json")
                        # get the answer
                        # (use a separate name so we don't shadow the csv "lines" list above)
                        batch = open("batch.json","r")
                        batch_lines = batch.readlines()
                        batch.close()
                        
                        # what they send back is JSONP with superfluous stuff at the front and back ends
                        match = []
                        if batch_lines: match = re.search(r"renderBatch\((\{.*\})", batch_lines[0])
                        if match:
                            result = match.group(1)
                            try:
                                locs = json.loads(result)
                                
                                # step through MapQuest's response and add data to Twitter's json response
                                for results in locs['results']:
                                    if results['providedLocation']['location'] in tweet_loc_dict.keys():
                                        #===========================
                                        # an idea for an alternative
                                        #===========================
                                        #needle   = results['providedLocation']['location']
                                        #haystack = tweet_loc_dict.keys()
                                        #indices  = [i for i, s in enumerate(haystack) if needle in s]
                                        dict_loc = tweet_loc_dict[results['providedLocation']['location']]
                                        tweetdata_list[dict_loc]["user"]["location_geoinfo"] = results['locations'][0]
                                        if tweetdata_list[dict_loc]["user"]["location_geoinfo"]:
                                            Geocoder_count += 1
                                    else:
                                        logging.warning("\nMAPQUEST KEY MISMATCH")
                                        logging.warning(tweet_loc_dict.keys())
                                        logging.warning(results)
                            except:
                                msg = "MapQuest sent invalid json"
                                print msg
                                logging.warning(msg)
                                logging.warning(batch_lines)
                        else:
                            msg = "MapQuest sent empty response"
                            print msg
                            logging.warning(msg)
                            logging.warning(batch_lines)
                        
                        
                                    
                    # process the json file and start over with a new batch from Twitter
                    # ==================================================================
                    process_output_file(tweetdata_list, output_filename)
                    bulk_list = []
                  
        file_counter+=1
def create_bulkfile(list_of_filenames, starting_at=1, ending_at=0):
    """
    - reads in a list of fully-qualified filenames from "list_of_filenames"
    
        I'm expecting file names to have the Windows Google Drive structure, for example
        ... Twitter Data\June\Cardiovasucular\Tweets_AFib.csv  
        
        the code is commented with a simple solution you can implement to allow you to have
        any arbitrary fully-qualified filename, for any operating system
        
    - processes each row of each file in the file list, 
      making batched calls to Twitter to retrieve the data for each tweet
    
    - after every 13,500 rows, or whenever there is a threshold-exceeded error
      the output_file is written and the program goes to sleep for 15 minutes.
      
    Note: AFINN-111.txt must be in the same folder
          you can use it as is or include your own n-grams
          the 'sentiment' field is the sum of the scores of all the n-grams found  
          
    Note: Requires pygeocoder
    
    Input: list_of_filenames   a text file with fully-qualified file names
           starting_at         the line number of "list_of_filenames" where processing should start
           ending_at           if 0   process all files beginning with the "starting_at" line in "list_of_filenames"
                               if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames"
           
    Output: a csv file named "bigtweet_filexxx.csv", where xxx is the "starting_at" number
        
    Usage: %run create_bulkfile.py "filename_list.csv" 1 0
    
    A message like "263 skipped id 463811853097787392" indicates that Twitter did not return data
    for a tweet with the id of 463811853097787392 and this is the 263rd instance of this. 
    As a result of this and other less-common errors the output file will have fewer rows than 
    the total rows in the input files.
    """
    import csv
    import json
    import re
    import time
    import sys
    import six
    import datetime
    from twitter_functions import lookup_multiple_tweets
    from twitter_functions import parse_AFINN
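    # parse_tweet_json(), find_sentiment() and sleep_process() are called below but not
    # imported here; they are assumed to be defined elsewhere in this module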
    
    # convert input parameter strings to integer
    starting_at = int(starting_at) 
    ending_at   = int(ending_at)
    
    process_start = datetime.datetime.now()
    print "\n================================"
    print "process start: %s"%process_start.strftime("%c")
    print "================================\n"
    
    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)
    
    output_filename   = "bigtweet_file" + "%03d"%(starting_at,) + ".csv"
    step              = 100 # we're going to process in groups of "step"
    bulk_list         = []  # batch of rows from input file 
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict       = []  # list of dicts to send to output file
    
    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch       = 13500 # we sleep after this many lines processed
    sleep_batch_rows  = 0     # the number of lines we've processed since the last sleep
    
    number_of_files   = len(filename_list) # how many files in the list
    file_counter      = 1                  # which one is this one
    first_sleep       = True               # first time through, we write an output_file header
    invalid_json      = False              # in case Twitter sends us junk
    skip_counter      = 0                  # how many rows did we skip because Twitter didn't send us info
    
    # read in the n-grams for sentiment processing
    sentiment_words, sentiment_phrases = parse_AFINN("AFINN-111.txt")
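    # AFINN-111.txt is a tab-separated list of n-grams with integer valence scores
    # (-5 to +5); parse_AFINN is assumed to split it into single words and multi-word
    # phrases so the 'sentiment' field can be the sum of the scores found in a tweet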
    
    # read each file in and process it
    # ==================================
    for input_filename in filename_list:
        
        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            print "Skipping %d of %d %s"%(file_counter, number_of_files, input_filename)
            file_counter+=1
            continue  
            
        if ending_at != 0: number_of_files = ending_at
            
        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename) 
        short_file_name = match.group(1) if match else input_filename

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s"%(file_counter, number_of_files, input_filename)
                break
        
        # open an input file
        with open(input_filename, "rb" ) as infile:
            reader     = csv.DictReader(infile)
            lines      = list(reader) # list of all lines/rows in the input file
            totallines = len(lines)   # number of lines in the input file
            
            print "\n--Processing %d of %d %s rows %d"%(file_counter, number_of_files, short_file_name,totallines)
            
            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):
                
                # sleep if we're over the limit of lines processed
                sleep_batch_rows+=1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename, first_sleep)
                    
                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id  = row['url'].split("/")[-1]
                # make sure tweet_id is entirely numeric
                if re.match(r"^\d+$", tweet_id):
                    # the whole id is numeric
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d"%(linenum+1)
                    print row['url']
                
                # if batch-size reached, process the batch
                if len(bulk_list) >= step or (linenum+1) >= totallines:
                   
                    # make a batch request to Twitter 
                    # ===============================
                    while True:
                        result = lookup_multiple_tweets(list_of_tweet_ids)
                        if result: break
                        print "\nTwitter returned an empty result\n"
                        time.sleep(1)
                        
                    list_of_tweet_ids = []
                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                            bulk_list = []
                            invalid_json = True
                            break
                            
                    if invalid_json:
                        invalid_json = False
                        break
                        
                    # if Twitter returns an error
                    #
                    # better process
                    # try:
                    #     statuses = api.GetUserTimeline(u.id)
                    #     print [s.text for s in statuses]
                    # except TwitterError, t:
                    #     print t
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                        sleep_batch_rows = 0
                        sleep_process(output_dict, output_filename, first_sleep)
                        bulk_list = [] # we lose the batch
                        continue
                    
                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested
                    #
                    # So we create a dictionary for the tweetdata_list
                    # associating id's with their position in the list
                    # and a list of id's for searching
                    
                    tweet_id_dict = {}
                    tweet_id_list = []
                    
                    # find every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)
                        
                    # pull each of the lines and its corresponding Twitter response
                    batch_process_count = 0
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter+=1
                            # check that the entire line['id'] is numeric
                            if re.match(r"^\d+$", line['id']):
                                # yes
                                print "%d skipped id %d"%(skip_counter, int(line['id']))
                            else:
                                # no
                                print skip_counter
                                print "line['id'] is not all numeric"
                                print line['id']                               
                            continue
                            
                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter+=1
                            print "id mismatch, skipping %d"%(skip_counter)
                            print "line  id %s"%(str(line['id']))
                            print "tweet id %s"%(str(tweetdata['id']))
                            continue

                        # parse Twitter's response
                        line["file_counter"]    = file_counter
                        line["short_file_name"] = short_file_name
                        line = parse_tweet_json(line, tweetdata)
                        line['sentiment'] = find_sentiment(tweetdata, sentiment_words, sentiment_phrases)
                        
                        output_dict.append(line)
                        batch_process_count+=1
                        
                       
                    print "Rows processed: " + str(len(output_dict)) 
                    bulk_list = []
                    
        file_counter+=1
def add_twitter_data_bulk(input_filename):
    """
    reads in a *.csv file from the Coursolve Healthcare Twitter Analysis project
    and produces an output *.csv file with a number of Twitter fields added
    
    The name of the output file is the name of the input file with "_full" appended
    
    Notes:
    1. "twitter_functions.py" must be in your folder or somewhere on your path
    
    2. You must provide your own file named "twitter_credentials.py"
       (see https://apps.twitter.com/) written like this:
       
def twitter_credentials():
    api_key = " your credentials "
    api_secret = " your credentials "
    access_token_key = " your credentials "
    access_token_secret = " your credentials "
    return (api_key,api_secret,access_token_key,access_token_secret)
    
     3. You need to be aware that Twitter throttles your activity. 
        This function makes bulk calls to Twitter to try to increase 
        our throughput over add_twitter_data.py which makes one call to
        Twitter for every line
        
     4. IPython usage:
        (1) from add_twitter_data_bulk import add_twitter_data_bulk
            add_twitter_data_bulk("Tweets_BleedingDisorders.csv")
            
        (2) %run add_twitter_data_bulk.py "Tweets_BleedingDisorders.csv"
        
     5. If you have problems, I'll try to help ... [email protected]

    """
    import csv
    import json
    from twitter_functions import lookup_multiple_tweets
    from twitter_functions import parse_tweet_json
    
    output_filename   = input_filename.split(".")[0] + "_full.csv"
    step              = 95  # we're going to process in groups of "step"
    bulk_list         = []  # batch of rows from input file 
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict       = []  # list of dicts to send to output file
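    # 95 ids per call stays under the 100-id cap of Twitter's bulk lookup endpoint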
    
    with open(input_filename, "rb" ) as infile:
       reader     = csv.DictReader(infile)
       lines      = list(reader) # list of all lines/rows in the input file
       totallines = len(lines)   # number of lines in the input file
       print "Rows in file: " + str(totallines)
       
       # read the input file line-by-line
       # ================================
       for linenum, row in enumerate(lines):
        
           # accumulate a batch of rows from the input file
           # ==============================================
           tweet_id  = row['url'].split("/")[-1]
           row['id'] = tweet_id
           bulk_list.append(row)
           list_of_tweet_ids.append(tweet_id)
           
           # process the batch
           # =================
           if len(bulk_list) >= step or (linenum+1) >= totallines:
               
               # make a batch request to Twitter 
               result = lookup_multiple_tweets(list_of_tweet_ids)
               list_of_tweet_ids = []
               for foo in result:
                   tweetdata_list = json.loads(foo)
                   break
               # if twitter returns an error
               #    print the error
               #    break => jump to output file processing
               if 'errors' in tweetdata_list:
                   print "\nTwitter returned an error message:"
                   print "message: " + tweetdata_list["errors"][0]['message']
                   print "code:    " + str(tweetdata_list["errors"][0]['code'])
                   print "\nIf the message is 'Rate limit exceeded', see\nhttps://dev.twitter.com/docs/rate-limiting/1.1"
                   print "It basically seems to mean you have to wait 15 minutes"
                   import datetime
                   from datetime import timedelta
                   timenow    = datetime.datetime.today().strftime("%H:%M:%S")
                   timeplus15 = (datetime.datetime.today()+timedelta(minutes=15)).strftime("%H:%M:%S")
                   print " time now:           " + timenow +"\n time in 15 minutes: " + timeplus15
                   print "\nAny rows of " + input_filename + " that were processed up to this point should be in the output file\n"          
                   break

               # Twitter's response is in an arbitrary order so sort both lists by id
               bulk_list      = sorted(bulk_list,      key=lambda k: k['id'])
               tweetdata_list = sorted(tweetdata_list, key=lambda k: k['id'])
               if len(bulk_list) != len(tweetdata_list):
                   print "\nTwitter returned a different number of responses than we requested"
                   print "linenum:   " + str(linenum)
                   print "Requested: " + str(len(bulk_list))
                   print "Received:  " + str(len(tweetdata_list))
               
               for line, tweetdata in zip(bulk_list, tweetdata_list):
                   if str(tweetdata['id']) != str(line['id']):
                       print "\nmismatch in ids, skipping remaining rows in this batch"
                       print "tweetdata['id']=" + str(tweetdata['id'])
                       print "line['id']=     " + str(line['id'])
                       break

                   parse_tweet_json(line, tweetdata)
                   output_dict.append(line)
                   
               print "Rows processed: " + str(len(output_dict)) 
               bulk_list = []
               
    # create the output file
    # ======================
    if output_dict:
        f = open(output_filename,'wb')
        w = csv.DictWriter(f, delimiter=",", fieldnames=output_dict[0].keys())
        w.writeheader()
        w.writerows(output_dict)
        f.close()
        print output_filename + " has been created"
    else:
        print output_filename + " was NOT created"
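# A minimal command-line wrapper would look like the sketch below; this is hypothetical
# (the repo's own script entry point, if any, may differ):
#
#   if __name__ == "__main__":
#       import sys
#       add_twitter_data_bulk(sys.argv[1])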
def get_twitter_json(list_of_filenames,
                     starting_at=1,
                     ending_at=0,
                     geocode=True):
    """
    1) reads in a list of fully-qualified filenames from "list_of_filenames"
        
    2) processes each row of each topsy file in the "list_of_filenames", 
       making batched calls to Twitter to retrieve the json for each tweet
           adding the data from the topsy file such as "score"
           plus unix timestamps for the topsy firstpost_date field and Twitter's created_at field
           plus coordinate and place name data for Twitter's user location field
    
    - after every 13,500 rows, or whenever there is a threshold-exceeded error
      the program goes to sleep for 15 minutes.
      
    Note: a file named twitter_credentials.py must be in the folder with the code;
          it contains your Twitter credentials (see the repo)
    
    Note: if geocode=True a file named mapquest_key.txt must be in the folder with the code
          get a MapQuest key here: http://developer.mapquest.com/
   
      
    Input: list_of_filenames   a text file with fully-qualified file names
           starting_at         the line number of "list_of_filenames" where processing should start
           ending_at           if 0   process all files beginning with the "starting_at" line in "list_of_filenames"
                               if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames"
           geocode             if True, batched requests are made to the MapQuest Developers API for coordinate and place name data
                               if False, these calls are not made and no geo info is added
                                 
           
    Output: a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number
        
    Usage: %run get_twitter_json.py "filename_list.csv" 2 2
                          - or -
           nohup python get_twitter_json.py "filename_list.csv" 1 0 &
    
    A message like "6 skipped id 448176144668721152" means that Twitter failed to return any data about 
    a tweet with id 448... and that this is the 6th instance of this. Fewer than 1% are skipped.
    
    To use the output file in python:
    =================================
import json
tweet_file = open("bigtweet_file002.json", "r")
for line in tweet_file:
    tweet = json.loads(str(line))
    if tweet['retweet_count'] > 100:
        print "\n\n%d %s\n%s"%(tweet['retweet_count'], tweet['user']['name'], tweet['text'])

        
    To use the output file in R:
    ============================
library(rjson)
file_path  = ("../files/bigtweet_file002.json")
tweet_list = fromJSON(sprintf("[%s]", paste(readLines(file_path),collapse=",")))

for (i in 1:length(tweet_list)){
    if (tweet_list[[i]]$retweet_count > 100){
        cat(sprintf("\n\n%d %s\n%s",tweet_list[[i]]$retweet_count, tweet_list[[i]]$user$name, tweet_list[[i]]$text))
    }
} 
## convert to twitteR structure
library(twitteR)
tweets = import_statuses(raw_data=tweet_list)

   To store in MongoDB using python:
   =================================
# create a python list of each tweet
import json
tweet_file = open("../files/bigtweet_file002.json", "r")
tweet_list = [json.loads(str(line)) for line in tweet_file]

# store the list in MongoDB
from pymongo import MongoClient
client = MongoClient()
db     = client['file002']
posts  = db.posts
#db.posts.remove( { } ) # delete if previously created

posts.insert(tweet_list)

# same example as above
for result in db.posts.find({ "retweet_count": { "$gt": 100 } }):
    print "%d %s\n%s"%(result['retweet_count'],result['user']['name'],result['text'])
    """

    import csv, json
    import re
    import time, datetime
    import sys, os
    import urllib2, urllib
    import os.path
    import logging  # logging.info/.warning are used throughout this function
    from twitter_functions import lookup_multiple_tweets

    # convert input parameters (they arrive as strings when invoked from the command line)
    starting_at = int(starting_at)
    ending_at = int(ending_at)
    # bool("False") would be True, so treat only explicit "false"/"0" strings as False
    geocode = str(geocode).lower() not in ("false", "0")
    msg = "\nlist_of_filenames %s; starting_at %d; ending_at %d; geocode %d" % (
        list_of_filenames, starting_at, ending_at, geocode)
    logging.info(msg)

    process_start = datetime.datetime.now()
    msg = "\n=======================================\nprocess start: %s"%process_start.strftime("%c") + \
          "\n=======================================\n"
    print msg
    sys.stdout.flush()
    logging.info(msg)

    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)

    output_filename = "bigtweet_file" + "%03d" % (starting_at, ) + ".json"
    step = 100  # we're going to process in groups of "step"
    bulk_list = []  # batch of rows from input file
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict = []  # list of dicts to send to output file

    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch = 13500  # we sleep after this many lines processed
    sleep_batch_rows = 0  # the number of lines we've processed since the last sleep

    # MapQuest Developer API documentation: http://developer.mapquest.com/
    Geocoder_count = 0  # how many records did we geocode?
    if geocode:
        f = open('mapquest_key.txt', 'r')
        key = f.readline()
        f.close()
        mapq_url = 'http://www.mapquestapi.com/geocoding/v1/batch?key='
        mapq_url = mapq_url + key + '&outFormat=json&maxResults=1&callback=renderBatch'
        logging.info("MAPQUEST URL " + mapq_url)

    number_of_files = len(filename_list)  # how many files in the list
    file_counter = 1  # which one is this one
    global first_sleep
    first_sleep = True  # first time through, we write an output_file header
    invalid_json = False  # in case Twitter sends us junk
    global total_processed
    total_processed = 0  # how many rows have we processed
    skip_counter = 0  # how many rows did we skip because Twitter didn't send us info

    # read each file in and process it
    # ==================================
    for input_filename in filename_list:

        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            msg = "Skipping %d of %d %s" % (file_counter, number_of_files,
                                            input_filename)
            print msg
            logging.info(msg)
            file_counter += 1
            continue

        if ending_at != 0: number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)",
                          input_filename)  # Windows Google Drive
        #match = re.search("/home/ubuntu/files(.*)", input_filename) # AWS Ubuntu
        short_file_name = match.group(1) if match else input_filename

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                msg = "Ending before %d of %d %s" % (
                    file_counter, number_of_files, input_filename)
                print msg
                logging.info(msg)
                break

        # check that the file exists
        if not os.path.isfile(input_filename):
            msg = "%s does not exist" % input_filename
            print msg
            logging.info(msg)
            file_counter += 1
            continue

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)  # list of all lines/rows in the input file
            totallines = len(lines)  # number of lines in the input file

            msg = "\n--Processing %d of %d %s rows %d" % (
                file_counter, number_of_files, short_file_name, totallines)
            print msg
            logging.info(msg)
            sys.stdout.flush()

            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows += 1
                if sleep_batch_rows > sleep_batch:
                    msg = "sleeping after %d lines of file %d of %d %s" % (
                        linenum, file_counter, number_of_files,
                        short_file_name)
                    print msg
                    logging.info(msg)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]
                # make sure tweet_id is entirely numeric
                if re.match(r"^\d+$", tweet_id):
                    # the whole id is numeric
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    msg = "tweet url terminated with non-numeric in line %d" % (
                        linenum + 1)
                    print msg
                    logging.info(msg)
                    print row['url']
                    logging.info(row['url'])

                # if batch-size reached, process the batch
                if len(list_of_tweet_ids) >= step or (linenum +
                                                      1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    result = lookup_multiple_tweets(list_of_tweet_ids)

                    list_of_tweet_ids = []

                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            msg = "\nTwitter returned invalid json"
                            print msg
                            logging.info(msg)
                            print e
                            logging.info(e)
                            msg = "after %d lines of file %d of %d %s" % (
                                linenum, file_counter, number_of_files,
                                short_file_name)
                            print msg
                            logging.info(msg)
                            bulk_list = []
                            invalid_json = True
                            break

                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    if 'errors' in tweetdata_list:
                        msg = "Twitter returned an error message:\n" + \
                              "message: " + str(tweetdata_list["errors"][0]['message']) + \
                              "\ncode:    " + str(tweetdata_list["errors"][0]['code']) + \
                              "\nafter %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                        print msg
                        logging.info(msg)
                        sleep_batch_rows = 0
                        sleep_process(tweetdata_list, output_filename)
                        bulk_list = []  # we lose the batch
                        continue

                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested
                    #
                    # So we create a dictionary for the tweetdata_list
                    # associating id's with their position in the list
                    # and a list of id's for searching

                    tweet_id_dict = {}
                    tweet_id_list = []

                    tweet_loc_dict = {}
                    tweet_loc_list = []

                    # save every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)

                        # save every location and its position
                        if tweetdata_list[i]['user'][
                                'location'] is not None and tweetdata_list[i][
                                    'user']['location'].strip() != "":
                            try:
                                loc = str(
                                    tweetdata_list[i]['user']['location'])
                                tweet_loc_dict[loc] = i
                                tweet_loc_list.append(loc)
                            except:
                                pass

                    # pull each of the lines and its corresponding Twitter response
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter += 1
                            # check that the entire line['id'] is numeric
                            if re.match(r"^\d+$", line['id']):
                                # yes
                                msg = "%d skipped id %d" % (skip_counter,
                                                            int(line['id']))
                                print msg
                                logging.info(msg)
                            else:
                                # no
                                print skip_counter
                                logging.info(skip_counter)
                                msg = "line['id'] is not all numeric"
                                print msg
                                logging.info(msg)
                                print line['id']
                                logging.info(line['id'])
                            continue

                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter += 1
                            msg = "id mismatch, skipping %d"%(skip_counter) + \
                                   "\nline  id %s"%(str(line['id'])) + \
                                   "\ntweet id %s"%(str(tweetdata['id']))
                            print msg
                            logging.info(msg)
                            continue

                        # ===========================================
                        # add Topsy fields to Twitter's json response
                        # ===========================================

                        # add a timestamp for 'created_at'
                        # time.ctime(tweet['timestamp']) # will decode this field
                        tweetdata_list[tweet_id_dict[line['id']]]['timestamp'] = \
                            time.mktime(datetime.datetime.strptime(tweetdata_list[tweet_id_dict[line['id']]]['created_at'], '%a %b %d %H:%M:%S +0000 %Y').timetuple())

                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'] = {}
                        # add a timestamp for topsy's 'firstpost_date'
                        try:
                            tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \
                                time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%Y").timetuple())
                        except:
                            try:
                                tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \
                                    time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%y").timetuple())
                            except:
                                tweetdata_list[tweet_id_dict[
                                    line['id']]]['topsy']['timestamp'] = ""

                        # add the topsy csv file fields to the Twitter json
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'][
                            'firstpost_date'] = line['firstpost_date']
                        tweetdata_list[tweet_id_dict[
                            line['id']]]['topsy']['score'] = float(
                                line['score'])
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'][
                            'trackback_author_nick'] = line[
                                'trackback_author_nick']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'][
                            'trackback_author_url'] = line[
                                'trackback_author_url']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'][
                            'trackback_permalink'] = line[
                                'trackback_permalink']
                        tweetdata_list[tweet_id_dict[
                            line['id']]]['topsy']['url'] = line['url']

                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'][
                            'file_counter'] = file_counter
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'][
                            'short_file_name'] = short_file_name

                    # =======================================
                    # add geo data to Twitter's json response
                    # =======================================

                    if geocode:
                        # give everybody a blank
                        for idx in range(len(tweetdata_list)):
                            tweetdata_list[idx]["user"][
                                "location_geoinfo"] = {}

                        # create a list of locations to send to MapQuest
                        loc_url = ''
                        for tweet_loc in tweet_loc_list:
                            # quote the location so spaces/special characters don't break the request
                            loc_url = loc_url + '&location=' + urllib.quote(tweet_loc)
                        # send 'em
                        urllib.urlretrieve(mapq_url + loc_url, "batch.json")
                        # get the answer
                        batch = open("batch.json", "r")
                        lines = batch.readlines()
                        batch.close()

                        # what they send back has superfluous stuff at the front and back ends
                        match = []
                        if lines:
                            match = re.search(r"renderBatch\((\{.*\})",
                                              lines[0])
                        if match:
                            result = match.group(1)
                            try:
                                locs = json.loads(result)

                                # step through MapQuest's response and add data to Twitter's json response
                                for results in locs['results']:
                                    if results['providedLocation']['location'] in tweet_loc_dict.keys():
                                        dict_loc = tweet_loc_dict[results['providedLocation']['location']]
                                        tweetdata_list[dict_loc]["user"]["location_geoinfo"] = results['locations'][0]
                                        if tweetdata_list[dict_loc]["user"]["location_geoinfo"]:
                                            Geocoder_count += 1
                                    else:
                                        logging.warning("\nMAPQUEST KEY MISMATCH")
                                        logging.warning(tweet_loc_dict.keys())
                                        logging.warning(results)
                            except:
                                msg = "MapQuest sent invalid json"
                                print msg
                                logging.warning(msg)
                                logging.warning(lines)
                        else:
                            msg = "MapQuest sent empty response"
                            print msg
                            logging.warning(msg)
                            logging.warning(lines)

                    # process the json file and start over with a new batch from Twitter
                    # ==================================================================
                    process_output_file(tweetdata_list, output_filename)
                    bulk_list = []

        file_counter += 1
def get_twitter_json(list_of_filenames, starting_at=1, ending_at=0, geocode=True):
    """
    1) reads in a list of fully-qualified filenames from "list_of_filenames"
        
    2) processes each row of each topsy file in the "list_of_filenames", 
       making batched calls to Twitter to retrieve the json for each tweet
           adding the data from the topsy file such as "score"
           plus unix timestamps for the topsy firstpost_date field and Twitter's created_at field
           plus coordinate and place name data for Twitter's user location field
    
    - after every 13,500 rows, or whenever there is a threshold-exceeded error
      the program goes to sleep for 15 minutes.
      
      On my quad i7 4G Windows 7 64-bit machine, with my Comcast Internet connection,
      I process about 1,700 tweets per minute. The sleep time for the Twitter threshold 
      obviously increases the elapsed time, yielding roughly 
      8 + 15 = 23 minutes elapsed time per 13,500 tweets.
      For 2,500,000 tweets, that's 2,500,000/13,500*23 = 4259 minutes/71 hours/3 days
      elapsed time for all of the tweets in the project.
    
    Note: a file named twitter_credentials.py must be in the folder with the code
          see the repo: it contains your Twitter credentials
    
    Note: if geocode=True a file named mapquest_key.txt must be in the folder with the code
          get a MapQuest key here: http://developer.mapquest.com/
          
    Note: if ["user"] is embedded in ["retweeted_status"] I do not get the location info
          This, plus problems like blank or incomprehensible ['location'] fields 
          puts the geo-tagging rate at 75%.
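
          A possible workaround (an untested sketch, not something this function does) is to
          fall back to the retweeted_status user's location when the top-level one is blank:

loc = tweetdata['user'].get('location') or ''
if not loc.strip() and 'retweeted_status' in tweetdata:
    loc = tweetdata['retweeted_status']['user'].get('location') or ''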
      
    Input: list_of_filenames   a text file with fully-qualified file names
           starting_at         the line number of "list_of_filenames" where processing should start
           ending_at           if 0   process all files beginning with the "starting_at" line in "list_of_filenames"
                               if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames"
           geocode             if True, batched requests are made to the MapQuest Developers API for coordinate and place name data
                               if False, these calls are not made and no geo info is added
                                 
           
    Output: a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number
        
    Usage: %run get_twitter_json.py "filename_list.csv" 2 2
    
    A message like "6 skipped id 448176144668721152" means that Twitter failed to return any data about 
    a tweet with id 448... and that this is the 6th instance of this. Fewer than 1% are skipped.
    
    To use the output file in python:
    =================================
import json
tweet_file = open("../files/bigtweet_file002.json", "r")
for line in tweet_file:
    tweet = json.loads(str(line))
    if tweet['retweet_count'] > 100:
        print "\n\n%d %s\n%s"%(tweet['retweet_count'], tweet['user']['name'], tweet['text'])

        
    To use the output file in R:
    ============================
library(rjson)
file_path  = ("../files/bigtweet_file002.json")
tweet_list = fromJSON(sprintf("[%s]", paste(readLines(file_path),collapse=",")))

for (i in 1:length(tweet_list)){
    if (tweet_list[[i]]$retweet_count > 100){
        cat(sprintf("\n\n%d %s\n%s",tweet_list[[i]]$retweet_count, tweet_list[[i]]$user$name, tweet_list[[i]]$text))
    }
} 
## convert to twitteR structure
library(twitteR)
tweets = import_statuses(raw_data=tweet_list)

    To store in MongoDB using python:
    =================================
# create a python list of each tweet
import json
tweet_file = open("../files/bigtweet_file002.json", "r")
tweet_list = [json.loads(str(line)) for line in tweet_file]

# store the list in MongoDB
from pymongo import MongoClient
client = MongoClient()
db     = client['file002']
posts  = db.posts
#db.posts.remove( { } ) # delete if previously created

posts.insert(tweet_list)

# same example as above
for result in db.posts.find({ "retweet_count": { "$gt": 100 } }):
    print "%d %s\n%s"%(result['retweet_count'],result['user']['name'],result['text'])
    """
    import csv, json
    import re
    import time, datetime
    import sys, os
    import urllib2,urllib
    import os.path
    from twitter_functions import lookup_multiple_tweets
    
    # convert input parameter strings to integer
    starting_at = int(starting_at) 
    ending_at   = int(ending_at)
    
    process_start = datetime.datetime.now()
    print "\n================================"
    print "process start: %s"%process_start.strftime("%c")
    print "================================\n"
    sys.stdout.flush()
    
    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)
    
    output_filename   = "bigtweet_file" + "%03d"%(starting_at,) + ".json"
    step              = 100 # we're going to process in groups of "step"
    bulk_list         = []  # batch of rows from input file
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict       = []  # list of dicts to send to output file
    
    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch       = 13500 # we sleep after this many lines processed
    sleep_batch_rows  = 0     # the number of lines we've processed since the last sleep
    
    # MapQuest Developer API documentation: http://developer.mapquest.com/
    Geocoder_count    = 0     # how many records did we geocode?
    if geocode:
        f = open('mapquest_key.txt','r')
        key = f.readline()
        f.close()
        mapq_url = 'http://www.mapquestapi.com/geocoding/v1/batch?key='
        mapq_url = mapq_url + key + '&outFormat=json&maxResults=1&callback=renderBatch'

    
    number_of_files   = len(filename_list) # how many files in the list
    file_counter      = 1                  # which one is this one
    global first_sleep
    first_sleep       = True               # first time through, we write an output_file header
    invalid_json      = False              # in case Twitter sends us junk
    global total_processed
    total_processed   = 0                  # how many rows have we processed
    skip_counter      = 0                  # how many rows did we skip because Twitter didn't send us info
    
    # read each file in and process it
    # ==================================
    for input_filename in filename_list:
        
        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            print "Skipping %d of %d %s"%(file_counter, number_of_files, input_filename)
            file_counter+=1
            continue  
            
        if ending_at != 0: number_of_files = ending_at
            
        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename) 
        short_file_name = match.group(1)  

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s"%(file_counter, number_of_files, input_filename)
                break
                
        # check that the file exists
        if not os.path.isfile(input_filename):
            print "%s does not exist"%input_filename
            file_counter+=1
            continue
        
        # open an input file
        with open(input_filename, "rb" ) as infile:
            reader     = csv.DictReader(infile)
            lines      = list(reader) # list of all lines/rows in the input file
            totallines = len(lines)   # number of lines in the input file
            
            print "\n--Processing %d of %d %s rows %d"%(file_counter, number_of_files, short_file_name,totallines)
            sys.stdout.flush()
            
            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):
                
                # sleep if we're over the limit of lines processed
                sleep_batch_rows+=1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename)
                    
                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id  = row['url'].split("/")[-1]
                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):
                    # Successful match at the start of the string
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d"%(linenum+1)
                    print row['url']
                
                # if batch-size reached, process the batch
                if len(list_of_tweet_ids) >= step or (linenum+1) >= totallines:
                   
                    # make a batch request to Twitter 
                    # ===============================
                    result = lookup_multiple_tweets(list_of_tweet_ids)
                        
                    list_of_tweet_ids = []
                    
                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                            bulk_list = []
                            invalid_json = True
                            break
                            
                    if invalid_json:
                        invalid_json = False
                        break
                        
                    # if Twitter returns an error
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s"%(linenum, file_counter, number_of_files, short_file_name)
                        sleep_batch_rows = 0
                        sleep_process(tweetdata_list, output_filename)
                        bulk_list = [] # we lose the batch
                        continue
                    
                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested
                    #
                    # So we create a dictionary for the tweetdata_list
                    # associating id's with their position in the list
                    # and a list of id's for searching
                    
                    tweet_id_dict = {}
                    tweet_id_list = []
                    
                    tweet_loc_dict = {}
                    tweet_loc_list = []
                    
                    # save every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)
                        
                        # save every location and its position
                        if tweetdata_list[i]['user']['location'] is not None and tweetdata_list[i]['user']['location'].strip() != "":
                            try:
                                loc = str(tweetdata_list[i]['user']['location'])
                                tweet_loc_dict[loc] = i
                                tweet_loc_list.append(loc)
                            except:
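                                # most likely a UnicodeEncodeError: str() fails on a
                                # non-ASCII location, so we simply skip geocoding it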
                                pass
                        
                    # pull each of the lines and its corresponding Twitter response
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter+=1
                            # check the entire line['id'] is numeric
                            if re.match(r"^\d+", line['id']):
                                # yes
                                print "%d skipped id %d"%(skip_counter, int(line['id']))
                            else:
                                # no
                                print skip_counter
                                print "line['id'] is not all numeric"
                                print line['id']                               
                            continue
                            
                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter+=1
                            print "id mismatch, skipping %d"%(skip_counter)
                            print "line  id %s"%(str(line['id']))
                            print "tweet id %s"%(str(tweetdata['id']))
                            continue
                            
                        # ===========================================
                        # add Topsy fields to Twitter's json response
                        # ===========================================
                            
                        # add a timestamp for 'created_at'
                        # time.ctime(tweet['timestamp']) # will decode this field
                        tweetdata_list[tweet_id_dict[line['id']]]['timestamp'] = \
                            time.mktime(datetime.datetime.strptime(tweetdata_list[tweet_id_dict[line['id']]]['created_at'], '%a %b %d %H:%M:%S +0000 %Y').timetuple())
                            
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy'] = {}
                        # add a timestamp for topsy's 'firstpost_date' 
                        try: 
                            tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \
                                time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%Y").timetuple())
                        except:
                            try:
                                tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = \
                                    time.mktime(datetime.datetime.strptime(line['firstpost_date'], "%m/%d/%y").timetuple())
                            except:
                                tweetdata_list[tweet_id_dict[line['id']]]['topsy']['timestamp'] = ""
                                
                        # add the topsy csv file fields to the Twitter json
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['firstpost_date']        = line['firstpost_date']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['score']                 = float(line['score'])
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_author_nick'] = line['trackback_author_nick']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_author_url']  = line['trackback_author_url']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['trackback_permalink']   = line['trackback_permalink']
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['url']                   = line['url']
                        
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['file_counter']          = file_counter
                        tweetdata_list[tweet_id_dict[line['id']]]['topsy']['short_file_name']       = short_file_name
                        
                    # =======================================
                    # add geo data to Twitter's json response
                    # =======================================
                    
                    if geocode:
                        # give everybody a blank 
                        for idx in range(len(tweetdata_list)):
                            tweetdata_list[idx]["user"]["location_geoinfo"] = {}
                            
                        # create a list of locations to send to MapQuest
                        loc_url = ''
                        for tweet_loc in tweet_loc_list:
                            loc_url = loc_url + '&location=' + tweet_loc
                        # send 'em
                        urllib.urlretrieve(mapq_url + loc_url, "batch.json")
                        # get the answer
                        batch = open("batch.json","r")
                        lines = batch.readlines()
                        batch.close()
                        
                        # what they send back has superfluous stuff at the front and back ends
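                        # (the callback=renderBatch parameter wraps the json as renderBatch({...});
                        #  the [12:-1] slice below drops the 12-character "renderBatch(" prefix
                        #  and the trailing ")" to leave plain json)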
                        try:
                            locs = json.loads(lines[0][12:-1])
                            
                            # step through MapQuest's response and add data to Twitter's json response
                            for results in locs['results']:
                                if results['providedLocation']['location'] in tweet_loc_dict.keys():
                                    dict_loc = tweet_loc_dict[results['providedLocation']['location']]
                                    tweetdata_list[dict_loc]["user"]["location_geoinfo"] = results['locations'][0]
                                    Geocoder_count += 1
                        except:
                            print "MapQuest sent invalid json"
                        
                                    
                    # process the json file and start over with a new batch from Twitter
                    # ==================================================================
                    process_output_file(tweetdata_list, output_filename)
                    bulk_list = []
                  
        file_counter+=1
def add_twitter_data_bulk(input_filename):
    """
    reads in a *.csv file from the Coursolve Healthcare Twitter Analysis project
    and produces an output *.csv file with a number of Twitter fields added
    
    The name of the output file is the name of the input file with "_full" appended
    
    Notes:
    1. "twitter_functions.py" must be in your folder or somewhere on your path
    
    2. You must provide your own file named "twitter_credentials.py"
       (see https://apps.twitter.com/) written like this:
       
def twitter_credentials():
    api_key = " your credentials "
    api_secret = " your credentials "
    access_token_key = " your credentials "
    access_token_secret = " your credentials "
    return (api_key,api_secret,access_token_key,access_token_secret)
    
     3. You need to be aware that Twitter throttles your activity.
        This function makes bulk calls to Twitter to try to increase
        throughput over add_twitter_data.py, which makes one call to
        Twitter for every line.
        
     4. IPython usage:
        (1) from add_twitter_data_bulk import add_twitter_data_bulk
            add_twitter_data_bulk("Tweets_BleedingDisorders.csv")
            
        (2) %run add_twitter_data_bulk.py "Tweets_BleedingDisorders.csv"
        
     5. If you have problems, I'll try to help ... [email protected]

    """
    import csv
    import json
    from twitter_functions import lookup_multiple_tweets
    from twitter_functions import parse_tweet_json

    output_filename = input_filename.split(".")[0] + "_full.csv"
    step = 95  # we're going to process in groups of "step"
    bulk_list = []  # batch of rows from input file
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict = []  # list of dicts to send to output file

    with open(input_filename, "rb") as infile:
        reader = csv.DictReader(infile)
        lines = list(reader)  # list of all lines/rows in the input file
        totallines = len(lines)  # number of lines in the input file
        print "Rows in file: " + str(totallines)

        # read the input file line-by-line
        # ================================
        for linenum, row in enumerate(lines):

            # accumulate a batch of rows from the input file
            # ==============================================
            tweet_id = row['url'].split("/")[-1]
            row['id'] = tweet_id
            bulk_list.append(row)
            list_of_tweet_ids.append(tweet_id)

            # process the batch
            # =================
            if len(bulk_list) >= step or (linenum + 1) >= totallines:

                # make a batch request to Twitter
                result = lookup_multiple_tweets(list_of_tweet_ids)
                list_of_tweet_ids = []
                for foo in result:
                    tweetdata_list = json.loads(foo)
                    break
                # if twitter returns an error
                #    print the error
                #    break => jump to output file processing
                if 'errors' in tweetdata_list:
                    print "\nTwitter returned an error message:"
                    print "message: " + tweetdata_list["errors"][0]['message']
                    print "code:    " + str(
                        tweetdata_list["errors"][0]['code'])
                    print "\nIf the message is 'Rate limit exceeded', see\nhttps://dev.twitter.com/docs/rate-limiting/1.1"
                    print "It basically seems to mean you have to wait 15 minutes"
                    import datetime
                    from datetime import timedelta
                    timenow = datetime.datetime.today().strftime("%H:%M:%S")
                    timeplus15 = (datetime.datetime.today() +
                                  timedelta(minutes=15)).strftime("%H:%M:%S")
                    print " time now:           " + timenow + "\n time in 15 minutes: " + timeplus15
                    print "\nAny rows of " + input_filename + " that were processed up to this point should be in the output file\n"
                    break

                # Twitter's response is in an arbitrary order so sort both lists by id
                # (line['id'] is a string while tweetdata['id'] is an int, so compare both as strings)
                bulk_list = sorted(bulk_list, key=lambda k: str(k['id']))
                tweetdata_list = sorted(tweetdata_list, key=lambda k: str(k['id']))
                if len(bulk_list) != len(tweetdata_list):
                    print "\nTwitter returned a different number of responses than we requested"
                    print "linenum:   " + str(linenum)
                    print "Requested: " + str(len(bulk_list))
                    print "Received:  " + str(len(tweetdata_list))

                for line, tweetdata in zip(bulk_list, tweetdata_list):
                    if str(tweetdata['id']) != str(line['id']):
                        print "\nmismatch in ids, skipping remaining rows in this batch"
                        print "tweetdata['id']=" + str(tweetdata['id'])
                        print "line['id']=     " + str(line['id'])
                        break

                    parse_tweet_json(line, tweetdata)
                    output_dict.append(line)

                print "Rows processed: " + str(len(output_dict))
                bulk_list = []

    # create the output file
    # ======================
    if output_dict:
        f = open(output_filename, 'wb')
        w = csv.DictWriter(f, delimiter=",", fieldnames=output_dict[0].keys())
        w.writeheader()
        w.writerows(output_dict)
        f.close()
        print output_filename + " has been created"
    else:
        print output_filename + " was NOT created"
def create_bulkfile(list_of_filenames, starting_at=1, ending_at=0):
    """
    - reads in a list of fully-qualified filenames from "list_of_filenames"
    
        I'm expecting file names to have the Windows Google Drive structure, for example
        ... Twitter Data\June\Cardiovasucular\Tweets_AFib.csv  
        
        the code is commented with a simple change that lets you use any arbitrary
        fully-qualified filename on any operating system (see the sketch below)
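
        For reference, a rough cross-platform sketch of the same idea (illustration only,
        not what this function runs); it keeps everything after "Twitter Data" whether the
        path uses backslashes or forward slashes, and falls back to the full name:

import re
input_filename = r"C:\Users\me\Twitter Data\June\Tweets_AFib.csv"     # hypothetical example path
match = re.search(r"Twitter Data[\\/](.*)", input_filename)
short_file_name = match.group(1) if match else input_filename         # -> "June\Tweets_AFib.csv"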
        
    - processes each row of each file in the file list, 
      making batched calls to Twitter to retrieve the data for each tweet
    
    - after every 13,500 rows, or whenever there is a threshold-exceeded error
      the output_file is written and the program goes to sleep for 15 minutes.
      
    Note: AFINN-111.txt must be in the same folder
          you can use it as is or include your own n-grams
          the 'sentiment' field is the sum of the scores of all the n-grams found
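
    For reference, a minimal sketch of that scoring idea (single words only; the project's
    parse_AFINN/find_sentiment pair also matches multi-word n-grams). AFINN-111.txt is
    tab-separated as term<TAB>score:

afinn = {}
for entry in open("AFINN-111.txt"):
    term, score = entry.strip().split("\t")
    afinn[term] = int(score)

def rough_sentiment(text):
    # sum the AFINN scores of every word found in the text
    return sum(afinn.get(word, 0) for word in text.lower().split())

print rough_sentiment("this is wonderful but also a disaster")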
    
    Input: list_of_filenames   a text file with fully-qualified file names
           starting_at         the line number of "list_of_filenames" where processing should start
           ending_at           if 0   process all files beginning with the "starting_at" line in "list_of_filenames"
                               if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames"
           
    Output: a csv file named "bigtweet_filexxx.csv", where xxx is the "starting_at" number
        
    Usage: %run create_bulkfile.py "filename_list.csv" 1 0
    
    A message like "263 skipped id 463811853097787392" indicates that Twitter did not return data
    for a tweet with the id of 463811853097787392 and this is the 263rd instance of this. 
    As a result of this and other less-common errors the output file will have fewer rows than 
    the total rows in the input files.
    """
    import csv
    import json
    import re
    import time
    import sys
    import six
    import datetime
    from twitter_functions import lookup_multiple_tweets
    from twitter_functions import parse_AFINN

    # convert input parameter strings to integer
    starting_at = int(starting_at)
    ending_at = int(ending_at)

    process_start = datetime.datetime.now()
    print "\n================================"
    print "process start: %s" % process_start.strftime("%c")
    print "================================\n"

    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)

    output_filename = "bigtweet_file" + "%03d" % (starting_at, ) + ".csv"
    step = 100  # we're going to process in groups of "step"
    bulk_list = []  # batch of rows from input file
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict = []  # list of dicts to send to output file

    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch = 13500  # we sleep after this many lines processed
    sleep_batch_rows = 0  # the number of lines we've processed since the last sleep

    number_of_files = len(filename_list)  # how many files in the list
    file_counter = 1  # which one is this one
    first_sleep = True  # first time through, we write an output_file header
    invalid_json = False  # in case Twitter sends us junk
    skip_counter = 0  # how many rows did we skip because Twitter didn't send us info

    # read in the n-grams for sentiment processing
    sentiment_words, sentiment_phrases = parse_AFINN("AFINN-111.txt")

    # read each file in and process it
    # ==================================
    for input_filename in filename_list:

        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            print "Skipping %d of %d %s" % (file_counter, number_of_files,
                                            input_filename)
            file_counter += 1
            continue

        if ending_at != 0: number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename)
        short_file_name = match.group(1)

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s" % (
                    file_counter, number_of_files, input_filename)
                break

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)  # list of all lines/rows in the input file
            totallines = len(lines)  # number of lines in the input file

            print "\n--Processing %d of %d %s rows %d" % (
                file_counter, number_of_files, short_file_name, totallines)

            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows += 1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s" % (
                        linenum, file_counter, number_of_files,
                        short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename, first_sleep)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]
                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):
                    # Successful match at the start of the string
                    row['id'] = tweet_id
                    bulk_list.append(row)
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d" % (
                        linenum + 1)
                    print row['url']

                # if batch-size reached, process the batch
                if len(bulk_list) >= step or (linenum + 1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    while True:
                        result = lookup_multiple_tweets(list_of_tweet_ids)
                        if result: break
                        print "\nTwitter returned an empty result\n"
                        time.sleep(1)

                    list_of_tweet_ids = []
                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s" % (
                                linenum, file_counter, number_of_files,
                                short_file_name)
                            bulk_list = []
                            invalid_json = True
                            break

                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    #
                    # better process
                    # try:
                    #     statuses = api.GetUserTimeline(u.id)
                    #     print [s.text for s in statuses]
                    # except TwitterError, t:
                    #     print t
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(
                            tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(
                            tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s" % (
                            linenum, file_counter, number_of_files,
                            short_file_name)
                        sleep_batch_rows = 0
                        sleep_process(output_dict, output_filename,
                                      first_sleep)
                        bulk_list = []  # we lose the batch
                        continue

                    # Twitter's response is in arbitrary order and doesn't necessarily
                    # contain a response for every id we requested
                    #
                    # So we create a dictionary for the tweetdata_list
                    # associating id's with their position in the list
                    # and a list of id's for searching

                    tweet_id_dict = {}
                    tweet_id_list = []

                    # find every id in tweetdata_list and its position
                    for i in range(len(tweetdata_list)):
                        id = str(tweetdata_list[i]['id'])
                        tweet_id_dict[id] = i
                        tweet_id_list.append(id)

                    # pull each of the lines and its corresponding Twitter response
                    batch_process_count = 0
                    for line in bulk_list:
                        if line['id'] not in tweet_id_list:
                            skip_counter += 1
                            # check the entire line['id'] is numeric
                            if re.match(r"^\d+", line['id']):
                                # yes
                                print "%d skipped id %d" % (skip_counter,
                                                            int(line['id']))
                            else:
                                # no
                                print skip_counter
                                print "line['id'] is not all numeric"
                                print line['id']
                            continue

                        tweetdata = tweetdata_list[tweet_id_dict[line['id']]]
                        if str(line['id']) != str(tweetdata['id']):
                            skip_counter += 1
                            print "id mismatch, skipping %d" % (skip_counter)
                            print "line  id %s" % (str(line['id']))
                            print "tweet id %s" % (str(tweetdata['id']))
                            continue

                        # parse Twitter's response
                        line["file_counter"] = file_counter
                        line["short_file_name"] = short_file_name
                        line = parse_tweet_json(line, tweetdata)
                        line['sentiment'] = find_sentiment(
                            tweetdata, sentiment_words, sentiment_phrases)

                        output_dict.append(line)
                        batch_process_count += 1

                    print "Rows processed: " + str(len(output_dict))
                    bulk_list = []

        file_counter += 1
def create_jsonfile(list_of_filenames, starting_at=1, ending_at=0):
    """
    - reads in a list of fully-qualified filenames from "list_of_filenames"
        
    - processes each row of each file in the file list, 
      making batched calls to Twitter to retrieve the data for each tweet
    
    - after every 13,500 rows, or whenever there is a threshold-exceeded error
      the output_file is written and the program goes to sleep for 15 minutes.
      
    Input: list_of_filenames   a text file with fully-qualified file names
           starting_at         the line number of "list_of_filenames" where processing should start
           ending_at           if 0   process all files beginning with the "starting_at" line in "list_of_filenames"
                               if > 0 process the files from line "starting_at" to line "ending_at" in "list_of_filenames"
           
    Output: a text file named "bigtweet_filexxx.json", where xxx is the "starting_at" number
        
    Usage: %run create_jsonfile.py "filename_list.csv" 1 0
    
    To use the output file in python:
    =================================
import json
tweet_file = open("../files/bigtweet_file003.json", "r")
for line in tweet_file:
    tweet = json.loads(str(line))
    if tweet['retweet_count'] > 100:
        print "\n\n%d %s\n%s"%(tweet['retweet_count'], tweet['user']['name'], tweet['text'])

        
    To use the output file in R:
    ============================
library(rjson)
file_path  = ("../files/bigtweet_file003.json")
tweet_list = fromJSON(sprintf("[%s]", paste(readLines(file_path),collapse=",")))

for (i in 1:length(tweet_list)){
    if (tweet_list[[i]]$retweet_count > 100){
        cat(sprintf("\n\n%d %s\n%s",tweet_list[[i]]$retweet_count, tweet_list[[i]]$user$name, tweet_list[[i]]$text))
    }
} 
## convert to twitteR structure
library(twitteR)
tweets = import_statuses(raw_data=tweet_list)

    To store in MongoDB using python:
    =================================
# create a python list of each tweet
import json
tweet_file = open("../files/bigtweet_file003.json", "r")
tweet_list = [json.loads(str(line)) for line in tweet_file]

# store the list in MongoDB
from pymongo import MongoClient
client = MongoClient()
db     = client['file003']
posts  = db.posts
#db.posts.remove( { } ) # delete if previously created

posts.insert(tweet_list)

# same example as above
for result in db.posts.find({ "retweet_count": { "$gt": 100 } }):
    print "%d %s\n%s"%(result['retweet_count'],result['user']['name'],result['text'])
    """
    import csv
    import json
    import re
    import time
    import sys
    import datetime
    from twitter_functions import lookup_multiple_tweets

    # convert input parameter strings to integer
    starting_at = int(starting_at)
    ending_at = int(ending_at)

    process_start = datetime.datetime.now()
    print "\n================================"
    print "process start: %s" % process_start.strftime("%c")
    print "================================\n"

    # read the list of filenames into "filename_list"
    # ===============================================
    filename_list = []
    with open(list_of_filenames, "rb") as namefile:
        csv_reader = csv.reader(namefile)
        for row in csv_reader:
            filename_list.extend(row)

    output_filename = "bigtweet_file" + "%03d" % (starting_at, ) + ".json"
    step = 100  # we're going to process in groups of "step"
    list_of_tweet_ids = []  # tweet ids of these rows
    output_dict = []  # list of dicts to send to output file

    # the Twitter rate limits are documented here
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    sleep_batch = 13500  # we sleep after this many lines processed
    sleep_batch_rows = 0  # the number of lines we've processed since the last sleep

    number_of_files = len(filename_list)  # how many files in the list
    file_counter = 1  # which one is this one
    global first_sleep
    first_sleep = True  # first time through, we write an output_file header
    invalid_json = False  # in case Twitter sends us junk
    global total_processed
    total_processed = 0  # how many rows have we processed

    # read each file in and process it
    # ==================================
    for input_filename in filename_list:

        # skip the first "starting_at-1" files
        if file_counter < starting_at:
            print "Skipping %d of %d %s" % (file_counter, number_of_files,
                                            input_filename)
            file_counter += 1
            continue

        if ending_at != 0: number_of_files = ending_at

        # find the shortened file name
        #
        # note: if your filenames do not fit my convention
        #       replace the two lines below with
        #
        #       short_file_name = input_filename
        #
        match = re.search(r"Twitter Data\\(.*)", input_filename)
        short_file_name = match.group(1)

        # stop if we're beyond "ending_at"
        if ending_at > 0:
            if file_counter > ending_at:
                print "Ending before %d of %d %s" % (
                    file_counter, number_of_files, input_filename)
                break

        # open an input file
        with open(input_filename, "rb") as infile:
            reader = csv.DictReader(infile)
            lines = list(reader)  # list of all lines/rows in the input file
            totallines = len(lines)  # number of lines in the input file

            print "\n--Processing %d of %d %s rows %d" % (
                file_counter, number_of_files, short_file_name, totallines)

            # read the input file line-by-line
            # ================================
            for linenum, row in enumerate(lines):

                # sleep if we're over the limit of lines processed
                sleep_batch_rows += 1
                if sleep_batch_rows > sleep_batch:
                    print "sleeping after %d lines of file %d of %d %s" % (
                        linenum, file_counter, number_of_files,
                        short_file_name)
                    sleep_batch_rows = 0
                    sleep_process(output_dict, output_filename)

                # accumulate a batch of rows from the input file
                # ==============================================
                tweet_id = row['url'].split("/")[-1]
                # make sure tweet_id is actually numeric
                if re.match(r"^\d+", tweet_id):
                    # Successful match at the start of the string
                    row['id'] = tweet_id
                    list_of_tweet_ids.append(tweet_id)
                else:
                    print "tweet url terminated with non-numeric in line %d" % (
                        linenum + 1)
                    print row['url']

                # if batch-size reached, process the batch
                if len(list_of_tweet_ids) >= step or (linenum +
                                                      1) >= totallines:

                    # make a batch request to Twitter
                    # ===============================
                    result = lookup_multiple_tweets(list_of_tweet_ids)

                    list_of_tweet_ids = []

                    for foo in result:
                        try:
                            tweetdata_list = json.loads(foo)
                            break
                        except ValueError, e:
                            print "\nTwitter returned invalid json"
                            print e
                            print "after %d lines of file %d of %d %s" % (
                                linenum, file_counter, number_of_files,
                                short_file_name)
                            invalid_json = True
                            break

                    if invalid_json:
                        invalid_json = False
                        break

                    # if Twitter returns an error
                    #
                    # better process
                    # try:
                    #     statuses = api.GetUserTimeline(u.id)
                    #     print [s.text for s in statuses]
                    # except TwitterError, t:
                    #     print t
                    if 'errors' in tweetdata_list:
                        print "Twitter returned an error message:"
                        print "message: " + str(
                            tweetdata_list["errors"][0]['message'])
                        print "code:    " + str(
                            tweetdata_list["errors"][0]['code'])
                        print "after %d lines of file %d of %d %s" % (
                            linenum, file_counter, number_of_files,
                            short_file_name)
                        sleep_batch_rows = 0
                        sleep_process(tweetdata_list, output_filename)
                        continue

                    process_output_file(tweetdata_list, output_filename)

        file_counter += 1