def main():
    """Attach reverse-geocoded location data to every tweet in the input files.

    Each input file is read line by line as JSON.  Every tweet receives a
    ``geo_reverse`` field: the result of ``geolocate_reverse((lat, lon))``
    when the tweet carries a ``geo`` point, otherwise a dict of blank
    placeholder values.  Tweets are handed to ``process_output_file`` in
    batches, and whatever is left over is handed off once all input has
    been read.

    Lines that are not valid JSON objects are reported on stdout and left
    out of the output.  An input file that does not exist is reported and
    skipped.
    """
    import json
    import datetime
    import sys
    import os
    from geolocate_reverse import geolocate_reverse

    # NOTE(review): these module-level globals are presumably read and
    # updated by process_output_file (defined outside this function) --
    # confirm before changing them.
    global first_sleep, total_written
    first_sleep = True    # first time through, we write to a new file
    total_written = 0     # how many rows have we written to the output file

    input_file_list = ["HTA_geotagged.json"]
    output_json_filename = "HTA_reversegeo.json"
    batch_size = 500      # hand tweets to the output writer this often

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    process_start = datetime.datetime.now()
    rule = "\n============================================="
    print(rule +
          "\nupdate augmented geo data " +
          "\nprocess start: %s" % process_start.strftime("%c") +
          rule + "\n")
    sys.stdout.flush()

    total_lines = 0       # how many total input lines?
    output_list = []      # tweets waiting to be sent to the output file

    # read in the files one-by-one
    # ============================
    for file_counter, input_filename in enumerate(input_file_list, 1):
        print("---Processing file %d %s" % (file_counter, input_filename))
        sys.stdout.flush()

        # check that the file exists
        if not os.path.isfile(input_filename):
            print("%s does not exist " % input_filename)
            sys.stdout.flush()
            continue

        # open the file and read it line-by-line
        # ======================================
        with open(input_filename, "r") as infile:
            for file_lines, line in enumerate(infile, 1):
                total_lines += 1

                # read a line of json; anything that is not a JSON object
                # cannot carry the geo_reverse field, so treat it as bad too
                try:
                    tweet = json.loads(line)
                    if not isinstance(tweet, dict):
                        raise ValueError("expected a JSON object, got %s"
                                         % type(tweet).__name__)
                except ValueError as e:
                    print("\nat line %d of %s " % (file_lines, input_filename))
                    print(repr(e))
                    print("line will not be included in the output file\n")
                    sys.stdout.flush()
                    continue

                # .get() so a record without a "geo" key is treated like one
                # whose "geo" is null instead of raising KeyError
                geo = tweet.get("geo")
                if geo:
                    # this script reads "geo" coordinates as [lat, lon]
                    lat = geo["coordinates"][0]
                    lon = geo["coordinates"][1]
                    tweet["geo_reverse"] = geolocate_reverse((lat, lon))
                else:
                    # blank placeholders so every record has the same field
                    tweet["geo_reverse"] = {"country_code": "",
                                            "country": "",
                                            "zipcode": "",
                                            "city": "",
                                            "state": "",
                                            "state_abbr": "",
                                            "areacode": "",
                                            "FIPS": "",
                                            "county": "",
                                            "Type": "",
                                            "Pop_2010": "",
                                            "Land_Sq_Mi": ""}

                output_list.append(tweet)

                if total_lines % batch_size == 0:
                    process_output_file(output_list, output_json_filename)
                    output_list = []
                    right_now = datetime.datetime.now()
                    print("%s line %d of file %s" % (right_now.strftime("%c"),
                                                     file_lines,
                                                     input_filename))
                    sys.stdout.flush()

    # Write the final partial batch; without this, up to batch_size - 1
    # tweets read after the last full batch would never reach the output.
    if output_list:
        process_output_file(output_list, output_json_filename)
        output_list = []
def main():
    """Merge the input files into one output, dropping duplicate tweets.

    Each input file is read line by line as JSON.  A tweet whose ``id`` has
    already been seen (in this or an earlier input file) is counted as a
    duplicate and dropped.  Every remaining tweet that carries a
    ``coordinates`` point gets its ``geo_reverse`` field set from
    ``geolocate_reverse((lat, lon))``.  Tweets are handed to
    ``process_output_file`` in batches, and whatever is left over is handed
    off once all input has been read.

    Lines that are not valid JSON, or that are not objects with an ``id``,
    are reported on stdout and left out of the output.  An input file that
    does not exist is reported and skipped.
    """
    import json
    import datetime
    import sys
    import os
    from geolocate_reverse import geolocate_reverse

    # NOTE(review): these module-level globals are presumably read and
    # updated by process_output_file (defined outside this function) --
    # confirm before changing them.
    global first_sleep, total_written
    first_sleep = True    # first time through, we write to a new file
    total_written = 0     # how many rows have we written to the output file

    output_json_filename = "HTA_noduplicates.json"
    input_file_list = ["HTA_reversegeo.json",
                       "HTA_reversegeo2.json",
                       "HTA_reversegeo3.json",
                       "HTA_reversegeo4.json"]
    batch_size = 5000     # hand tweets to the output writer this often

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    process_start = datetime.datetime.now()
    rule = "\n============================================="
    print(rule +
          "\nremove duplicates " +
          "\nprocess start: %s" % process_start.strftime("%c") +
          rule + "\n")
    sys.stdout.flush()

    input_lines = 0       # how many total input lines?
    output_lines = 0      # how many output lines?
    duplicates = 0        # how many duplicates were found?
    coord_count = 0       # how many coordinate fields did we process?
    id_set = set()        # keep track of unique id's
    output_list = []      # tweets waiting to be sent to the output file

    # read in the files one-by-one
    # ============================
    for input_filename in input_file_list:
        print("---Processing file %s" % input_filename)
        sys.stdout.flush()

        # check that the file exists
        if not os.path.isfile(input_filename):
            print("%s does not exist " % input_filename)
            sys.stdout.flush()
            continue

        # open the file and read it line-by-line
        # ======================================
        with open(input_filename, "r") as infile:
            for file_lines, line in enumerate(infile, 1):
                input_lines += 1

                # read a line of json and pull out its id.  A record that
                # is not an object (TypeError) or has no "id" (KeyError)
                # cannot be de-duplicated, so it is reported and skipped
                # the same way as a line that fails to parse (ValueError).
                try:
                    tweet = json.loads(line)
                    tweet_id = tweet["id"]
                except (ValueError, KeyError, TypeError) as e:
                    print("\nat line %d of %s " % (file_lines, input_filename))
                    print(repr(e))
                    print("line will not be included in the output file\n")
                    sys.stdout.flush()
                    continue

                # have we already seen this tweet's id?
                if tweet_id in id_set:
                    duplicates += 1
                    continue
                id_set.add(tweet_id)

                # does this tweet have a coordinates field?
                # https://dev.twitter.com/docs/platform-objects/tweets
                # reverse geo using it, if it does.  .get() so a record
                # without the key is treated like one whose value is null.
                coordinates = tweet.get("coordinates")
                if coordinates:
                    coord_count += 1
                    # this script reads "coordinates" as [lon, lat]
                    lon = coordinates["coordinates"][0]
                    lat = coordinates["coordinates"][1]
                    tweet["geo_reverse"] = geolocate_reverse((lat, lon))

                # add to the output list
                output_list.append(tweet)
                output_lines += 1

                if output_lines % batch_size == 0:
                    process_output_file(output_list, output_json_filename)
                    output_list = []
                    right_now = datetime.datetime.now()
                    print("%s line %d of file %s" % (right_now.strftime("%c"),
                                                     file_lines,
                                                     input_filename))
                    sys.stdout.flush()

    # Write the final partial batch; without this, up to batch_size - 1
    # tweets accepted after the last full batch would never reach the output.
    if output_list:
        process_output_file(output_list, output_json_filename)
        output_list = []