def read_csv(source_csv, city_name, city_tag):
    """Import rental-permit rows from ``source_csv`` for one city.

    Looks up the City tagged ``city_tag``, then finds or creates the
    FeedInfo dated 2013-07-31, the placeholder Person named "Blank" and the
    Source tied to that feed.  Each data row is geocoded through
    ``Geo.lookup``, stored with ``make_building`` / ``make_person`` and the
    accumulated locations are mirrored into ``<city.tag>.json`` next to the
    CSV.  At the end all locations are handed to ``save_results`` with the
    destination ``<city_tag>.tsv``.

    Columns read from each row: 0 = address written as "street, number",
    1 = building id, 6 = invoice note (holds the owner), 12 = unit count.

    Args:
        source_csv: path of the CSV file; the first row is only printed.
        city_name: city name used in the geocoder search string.
        city_tag: tag of an existing City; also names the TSV output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    city_count = len(city_options)
    print("Number of cities available: %s" % city_count)
    if not city_count:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # Placeholder person that a newly created Source is attached to.
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # Keep a local copy of the data we've processed next to the CSV, so
    # subsequent runs can reload the locations gathered so far.
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        for row in reader:
            count += 1
            print("Looking at row: %s" % count)

            # The sheet keeps the house number at the end of the address
            # ("street, number"); move it to the front.
            parts = row[0].split(',')
            address = "%s %s" % (parts[-1], ",".join(parts[:-1]))

            bldg_id = row[1]
            print(bldg_id)

            # this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            prefix = 'Sent to:'
            if invoice_note.startswith(prefix):
                print("changing invoice note from: %s" % invoice_note)
                invoice_note = invoice_note[len(prefix):]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            # `conversions` maps an upper-cased address to a replacement;
            # an empty replacement marks a row we want to skip.
            if address.upper() in conversions:
                address = conversions[address.upper()]

            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
            else:
                address_key = address.upper()

                # check if we've started processing any results for this row
                if address_key in locations:
                    location = locations[address_key]
                else:
                    location = Location()

                # Only these geocoders are queried on this run.
                location.sources = ["google", "bing"]

                # do some geocoding, as needed:
                search = "%s, %s, %s" % (address_key, city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    # NOTE(review): force=True presumably re-queries even
                    # when a result is already stored; confirm in Geo.lookup.
                    if geo.lookup(search, geo_source, location, force=True):
                        any_updated = True

                # After the lookups the location carries the full source
                # list again (presumably the names results are stored
                # under; TODO confirm against Location).
                location.sources = ["google", "bing", "usgeo", "geonames",
                                    "openmq", "mq"]

                # Brand new locations have no address_alt yet; treat them
                # as updated so they reach the cache.
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                locations[address_key] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source,
                                     no_units=no_units)

                if invoice_note:
                    make_person(invoice_note, bldg, "Permit Applicant")

                if any_updated:
                    # Back it up for later.  This rewrites the whole cache
                    # file, which slows things down when nothing new is
                    # being downloaded.
                    buildings = {}
                    for key, value in locations.items():
                        buildings[key] = value.to_dict()
                    local_cache['buildings'] = buildings
                    save_json(cache_destination, local_cache)

                print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def read_csv(source_csv, city_tag, feed_date):
    """Import a rental-listing spreadsheet for one city via ``process_row``.

    Looks up the City tagged ``city_tag``, then finds or creates the
    FeedInfo for ``feed_date``, the placeholder Person named "Blank" and the
    Source tied to that feed.  Previously cached entries are loaded from
    ``<city.tag>.json`` next to the CSV, with each entry's ``'results'``
    rebuilt as a ``SearchResults`` object.  The first CSV row is only
    printed, the second row supplies the column names (normalized into
    keys), and every later row is passed to ``process_row``.  The per-row
    dict is stored in the cache under the address ``process_row`` returns,
    and ``all_done`` is called with the cache once the file is exhausted.

    Args:
        source_csv: path of the CSV file (two header rows, then data).
        city_tag: tag of an existing City.
        feed_date: value matched against / stored in ``FeedInfo.added``.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    # could also use city.models.find_by_city_state
    city_options = City.objects.filter(tag=city_tag)
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # Placeholder person that a newly created Source is attached to.
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # Ideally the database itself would be the cache instead of a local
    # file, but it's also good to not have to repeat geo queries when going
    # in bulk.  The site code *will* make geo queries, so it's still a good
    # idea to cache the coded address locally even if using the site code
    # for everything else.
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    print(cache_destination)

    loaded_cache = load_json(cache_destination, create=True)

    # Street address is the key; for each address the 'results' entry has
    # to be loaded into a SearchResults object separately.
    local_cache = {}
    for key in loaded_cache.keys():
        # Put a real address here to drop a cached value that was not
        # parsed correctly.
        if key.strip() == "some address with bad cached data":
            print("not adding:  %s" % key)
        else:
            current = loaded_cache[key]
            search_results = SearchResults()
            search_results.from_dict(current['results'], debug=False)
            current['results'] = search_results
            local_cache[key] = current

    # Reset skips for every run by truncating skips.txt (presumably
    # process_row appends skipped rows to it; TODO confirm).
    codecs.open("skips.txt", 'w', encoding='utf-8').close()

    # Applied in this order to turn a column title into a key; '/ ' must be
    # handled before '/'.
    replacements = [
        ('(', ''),
        (')', ''),
        ('-', '_'),
        ('.', ''),
        ('/ ', ''),
        ('/', '_'),
        ('"', ''),
        ('#', 'num'),
        (' ', '_'),
    ]

    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print('>, <'.join(next(reader)))
        print("")

        # *and* the second row in this case, which holds the column titles
        keys = []
        for item in next(reader):
            key = item.lower().strip()
            for old, new in replacements:
                key = key.replace(old, new)
            keys.append(key)
        print('>, <'.join(keys))

        # currently:
        # <street_address>, <unit_if_applicable>, <unit_type>, <rent>,
        # <security_deposit>, <sq_feet_per_unit>, <num_bedrooms>,
        # <num_bathrooms>, <maximum_occupancy_per_unit>, <lease_period>,
        # <availability>, <laundry>, <parking>, <air_conditioning>, <pets>,
        # <gym_fitness_center>, <game_room_rec_center_community_center>,
        # <pool>, <other_amenities>, <bike_friendly>, <recycling>,
        # <composting>, <gardening>, <public_transit>, <walk_friendly>,
        # <other_smartliving_features>, <who_pays_for_electricity>,
        # <who_pays_for_natural_gas>, <who_pays_for_water>,
        # <who_pays_for_trash_recycling_pickup>,
        # <who_pays_for_telephone_land_line>, <who_pays_for_cable>,
        # <who_pays_for_internet>, <electricity_provider>,
        # <electric_utility_cost_average_per_mo>,
        # <electric_utility_cost_low>, <electric_utility_cost_high>,
        # <natural_gas_provider>, <natural_gas_utility_cost_average_per_mo>,
        # <natural_gas_utility_cost_low>, <natural_gas_utility_cost_high>,
        # <energy_saving_features>, <utility_info_source>,
        # <agent_property_manager>, <property_website_url>,
        # <agent_property_manager_address>, <agent_property_manager_phone>,
        # <owner>, <comments>

        count = 0
        # Raise this to resume part way through the sheet (e.g. 6439).
        start = 0

        # To distribute options more evenly, randomize the row order in the
        # original spreadsheet rather than here.
        for row in reader:
            current = {}
            count += 1
            print("Looking at row: %s" % count)

            if count >= start:
                address = process_row(current, row, keys, local_cache, city,
                                      feed_source, count)
                print("")
                local_cache[address] = current
                # Saving here after every row would guard against crashes,
                # but it gets considerably slower as the cache file grows.

    all_done(cache_destination, local_cache)
def read_csv(source_csv, city_name, city_tag):
    """Import rental-permit rows from ``source_csv`` for one city.

    Looks up the City tagged ``city_tag``, then finds or creates the
    FeedInfo dated 2013-07-31, the placeholder Person named "Blank" and the
    Source tied to that feed.  Each data row is geocoded through
    ``Geo.lookup``, stored with ``make_building`` / ``make_person`` and the
    accumulated locations are mirrored into ``<city.tag>.json`` next to the
    CSV.  At the end all locations are handed to ``save_results`` with the
    destination ``<city_tag>.tsv``.

    Columns read from each row: 0 = address written as "street, number",
    1 = building id, 6 = invoice note (holds the owner), 12 = unit count.

    Args:
        source_csv: path of the CSV file; the first row is only printed.
        city_name: city name used in the geocoder search string.
        city_tag: tag of an existing City; also names the TSV output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    city_count = len(city_options)
    print("Number of cities available: %s" % city_count)
    if not city_count:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-07-31"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # Placeholder person that a newly created Source is attached to.
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # Keep a local copy of the data we've processed next to the CSV, so
    # subsequent runs can reload the locations gathered so far.
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        for row in reader:
            count += 1
            print("Looking at row: %s" % count)

            # The sheet keeps the house number at the end of the address
            # ("street, number"); move it to the front.
            parts = row[0].split(',')
            address = "%s %s" % (parts[-1], ",".join(parts[:-1]))

            bldg_id = row[1]
            print(bldg_id)

            # this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            prefix = 'Sent to:'
            if invoice_note.startswith(prefix):
                print("changing invoice note from: %s" % invoice_note)
                invoice_note = invoice_note[len(prefix):]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            # `conversions` maps an upper-cased address to a replacement;
            # an empty replacement marks a row we want to skip.
            if address.upper() in conversions:
                address = conversions[address.upper()]

            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
            else:
                address_key = address.upper()

                # check if we've started processing any results for this row
                if address_key in locations:
                    location = locations[address_key]
                else:
                    location = Location()

                # Only these geocoders are queried on this run.
                location.sources = ["google", "bing"]

                # do some geocoding, as needed:
                search = "%s, %s, %s" % (address_key, city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    # NOTE(review): force=True presumably re-queries even
                    # when a result is already stored; confirm in Geo.lookup.
                    if geo.lookup(search, geo_source, location, force=True):
                        any_updated = True

                # After the lookups the location carries the full source
                # list again (presumably the names results are stored
                # under; TODO confirm against Location).
                location.sources = ["google", "bing", "usgeo", "geonames",
                                    "openmq", "mq"]

                # Brand new locations have no address_alt yet; treat them
                # as updated so they reach the cache.
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                locations[address_key] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source,
                                     no_units=no_units)

                if invoice_note:
                    make_person(invoice_note, bldg, "Permit Applicant")

                if any_updated:
                    # Back it up for later.  This rewrites the whole cache
                    # file, which slows things down when nothing new is
                    # being downloaded.
                    buildings = {}
                    for key, value in locations.items():
                        buildings[key] = value.to_dict()
                    local_cache['buildings'] = buildings
                    save_json(cache_destination, local_cache)

                print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def read_csv(source_csv, city_name, city_tag):
    """Import a rental-address sheet that already carries coordinates.

    Looks up the City tagged ``city_tag``, then finds or creates the
    FeedInfo dated 2013-10-16, the placeholder Person named "Blank" and the
    Source tied to that feed.  For every data row the street address is
    assembled from its parts, the row's lat/lng are attached to the
    Location under the ``csv`` attribute (no remote geocoder is queried:
    the source list used for lookups is empty), and the building, optional
    unit and owner/agent are stored with ``make_building``, ``make_unit``
    and ``make_person``.  Locations are mirrored into ``<city.tag>.json``
    next to the CSV and finally handed to ``save_results`` with the
    destination ``<city_tag>.tsv``.

    Args:
        source_csv: path of the CSV file; the first row is only printed.
        city_name: city name used in the search / alternate address string.
        city_tag: tag of an existing City; also names the TSV output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    city_count = len(city_options)
    print("Number of cities available: %s" % city_count)
    if not city_count:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # Placeholder person that a newly created Source is attached to.
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # Keep a local copy of the data we've processed next to the CSV, so
    # subsequent runs can reload the locations gathered so far.
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        # To distribute options more evenly, randomize the row order in the
        # csv itself rather than here.
        for row in reader:
            count += 1
            print("Looking at row: %s" % count)
            print(row)

            # Column map of the sheet (only some columns are used below):
            #   0 full address      1 parcel id        2 street number
            #   3 street direction  4 street name      5 street suffix
            #   6 qualifier before the unit (eg building number)
            #   7 qualifier after (eg "UNIT" or "APT") 8 apartment number
            #  10 zip code         15 x coord == lng  16 lat
            #  17 entry floor ('z' in sheet)          22 zone
            #  23 building type    24 number of buildings
            #  25 number of units  34 central heat (Y/N)
            #  35 heat mechanism (not sure)
            #  42 owner or agent name                 43-47 their address
            # The parcel id doubles as the bldg_id given to make_building
            # (that gets used for the parcel too).
            bldg_id = row[1]
            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            qualifier_pre = row[6]
            qualifier_post = row[7]
            apt_num = row[8]
            lng = row[15]
            lat = row[16]
            bldg_type = row[23]
            no_units = row[25]
            # could be owner, could be agent
            owner_name = row[42]
            owner_address = ", ".join(row[43:48])

            # Empty parts leave runs of spaces behind after the join, so
            # collapse all whitespace runs and trim the ends in one step.
            address_main = " ".join([street_num, street_dir, street_name,
                                     street_sfx, qualifier_pre])
            address_main = " ".join(address_main.split())

            apt_main = " ".join([qualifier_post, apt_num]).strip()

            address = address_main
            print(address)

            # `conversions` maps an upper-cased address to a replacement;
            # an empty replacement marks a row we want to skip.
            if address.upper() in conversions:
                address = conversions[address.upper()]

            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
            else:
                address_key = address.upper()

                # check if we've started processing any results for this row
                if address_key in locations:
                    location = locations[address_key]
                else:
                    location = Location()

                # Skip geocoding for columbia: with no sources listed the
                # lookup loop below does nothing.
                location.sources = []

                search = "%s, %s, %s" % (address_key, city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    if geo.lookup(search, geo_source, location, force=True):
                        any_updated = True

                # 'csv' is listed first, ahead of the remote geocoders
                # (presumably the names results are stored under; TODO
                # confirm against Location).
                location.sources = ['csv', "google", "bing", "usgeo",
                                    "geonames", "openmq", "mq"]

                # manually add data from csv here:
                location.csv = [{'place': address, 'lat': lat, 'lng': lng}]

                # Brand new locations have no address_alt yet; treat them
                # as updated so they reach the cache.
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                locations[address_key] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source,
                                     no_units=no_units, bldg_type=bldg_type)

                if apt_main:
                    make_unit(apt_main, bldg)

                make_person(owner_name, bldg, "Agent", address=owner_address)

                if any_updated:
                    # Back it up for later.  This rewrites the whole cache
                    # file, which slows things down when nothing new is
                    # being added.
                    buildings = {}
                    for key, value in locations.items():
                        buildings[key] = value.to_dict()
                    local_cache['buildings'] = buildings
                    save_json(cache_destination, local_cache)

                print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def read_csv(source_csv, city_name, city_tag):
    """Import a rental-address sheet that already carries coordinates.

    Looks up the City tagged ``city_tag``, then finds or creates the
    FeedInfo dated 2013-10-16, the placeholder Person named "Blank" and the
    Source tied to that feed.  For every data row the street address is
    assembled from its parts, the row's lat/lng are attached to the
    Location under the ``csv`` attribute (no remote geocoder is queried:
    the source list used for lookups is empty), and the building, optional
    unit and owner/agent are stored with ``make_building``, ``make_unit``
    and ``make_person``.  Locations are mirrored into ``<city.tag>.json``
    next to the CSV and finally handed to ``save_results`` with the
    destination ``<city_tag>.tsv``.

    Args:
        source_csv: path of the CSV file; the first row is only printed.
        city_name: city name used in the search / alternate address string.
        city_tag: tag of an existing City; also names the TSV output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    city_count = len(city_options)
    print("Number of cities available: %s" % city_count)
    if not city_count:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # Placeholder person that a newly created Source is attached to.
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # Keep a local copy of the data we've processed next to the CSV, so
    # subsequent runs can reload the locations gathered so far.
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print('>, <'.join(next(reader)))

        count = 0
        # To distribute options more evenly, randomize the row order in the
        # csv itself rather than here.
        for row in reader:
            count += 1
            print("Looking at row: %s" % count)
            print(row)

            # Column map of the sheet (only some columns are used below):
            #   0 full address      1 parcel id        2 street number
            #   3 street direction  4 street name      5 street suffix
            #   6 qualifier before the unit (eg building number)
            #   7 qualifier after (eg "UNIT" or "APT") 8 apartment number
            #  10 zip code         15 x coord == lng  16 lat
            #  17 entry floor ('z' in sheet)          22 zone
            #  23 building type    24 number of buildings
            #  25 number of units  34 central heat (Y/N)
            #  35 heat mechanism (not sure)
            #  42 owner or agent name                 43-47 their address
            # The parcel id doubles as the bldg_id given to make_building
            # (that gets used for the parcel too).
            bldg_id = row[1]
            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            qualifier_pre = row[6]
            qualifier_post = row[7]
            apt_num = row[8]
            lng = row[15]
            lat = row[16]
            bldg_type = row[23]
            no_units = row[25]
            # could be owner, could be agent
            owner_name = row[42]
            owner_address = ", ".join(row[43:48])

            # Empty parts leave runs of spaces behind after the join, so
            # collapse all whitespace runs and trim the ends in one step.
            address_main = " ".join([street_num, street_dir, street_name,
                                     street_sfx, qualifier_pre])
            address_main = " ".join(address_main.split())

            apt_main = " ".join([qualifier_post, apt_num]).strip()

            address = address_main
            print(address)

            # `conversions` maps an upper-cased address to a replacement;
            # an empty replacement marks a row we want to skip.
            if address.upper() in conversions:
                address = conversions[address.upper()]

            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
            else:
                address_key = address.upper()

                # check if we've started processing any results for this row
                if address_key in locations:
                    location = locations[address_key]
                else:
                    location = Location()

                # Skip geocoding for columbia: with no sources listed the
                # lookup loop below does nothing.
                location.sources = []

                search = "%s, %s, %s" % (address_key, city_name, city.state)

                any_updated = False
                for geo_source in location.sources:
                    if geo.lookup(search, geo_source, location, force=True):
                        any_updated = True

                # 'csv' is listed first, ahead of the remote geocoders
                # (presumably the names results are stored under; TODO
                # confirm against Location).
                location.sources = ['csv', "google", "bing", "usgeo",
                                    "geonames", "openmq", "mq"]

                # manually add data from csv here:
                location.csv = [{'place': address, 'lat': lat, 'lng': lng}]

                # Brand new locations have no address_alt yet; treat them
                # as updated so they reach the cache.
                if not hasattr(location, "address_alt") or not location.address_alt:
                    any_updated = True

                location.address_alt = search
                locations[address_key] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source,
                                     no_units=no_units, bldg_type=bldg_type)

                if apt_main:
                    make_unit(apt_main, bldg)

                make_person(owner_name, bldg, "Agent", address=owner_address)

                if any_updated:
                    # Back it up for later.  This rewrites the whole cache
                    # file, which slows things down when nothing new is
                    # being added.
                    buildings = {}
                    for key, value in locations.items():
                        buildings[key] = value.to_dict()
                    local_cache['buildings'] = buildings
                    save_json(cache_destination, local_cache)

                print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def read_csv(source_csv):
    """Load the Bloomington rental CSV into the database.

    A City tagged ``bloomington_in`` must already exist.  The function finds
    or creates the FeedInfo dated 2013-08-29, a placeholder Person named
    "Blank" and the Source linking the two, then walks the CSV row by row:
    every address is looked up through ``geo.lookup()`` with the "google"
    source, stored with ``make_building()``, and its owner / agent are
    attached with ``make_person()``.

    Lookup results are cached in ``<city tag>.json`` next to ``source_csv``
    (rewritten after every row that changed something) and exported at the
    end with ``save_results()`` to ``bloomington-filtered.tsv``.

    CSV columns read: 0 building id, 1 address, 2 owner, 4 agent,
    9 building units, 10 units/bedrooms.  The first row is treated as a
    header and only printed.

    Args:
        source_csv: path of the CSV file to import.

    Raises:
        ValueError: if no City tagged ``bloomington_in`` exists.
    """
    city_options = City.objects.filter(tag="bloomington_in")
    city_count = len(city_options)
    print("Number of cities available: %s" % city_count)
    if not city_count:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-08-29"
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city)

    # placeholder person that owns the Source record
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s"
              % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city)

    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if "buildings" not in local_cache:
        local_cache["buildings"] = {}
    if "parcels" not in local_cache:
        local_cache["parcels"] = {}

    locations = {}
    for key, value in local_cache["buildings"].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    skips = 0
    # NOTE(review): on Python 2 the csv module does not accept unicode input,
    # so feeding it a codecs stream may fail on non-ASCII rows.  Other
    # importers in this project read with unicode_csv_reader(open(...));
    # confirm before switching, since that changes the type of every cell.
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)

        # just print the first row:
        print(">, <".join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            bldg_id = row[0]
            print(bldg_id)
            address = row[1]
            print(address)
            owner = row[2]
            # row[3] (owner contact) is not used
            agent = row[4]
            bldg_units = row[9]
            print(bldg_units)
            units_bdrms = row[10]
            print(units_bdrms)

            # check if this is one we want to skip or rewrite;
            # an empty conversion means "skip this address"
            raw_key = address.upper()
            if raw_key in conversions:
                address = conversions[raw_key]

            # make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
            else:
                # keyed on the (possibly converted) address
                location_key = address.upper()
                if location_key in locations:
                    location = locations[location_key]
                else:
                    location = Location()

                # temporarily just want to look at google again
                location.sources = ["google"]

                # do some geocoding, as needed:
                search = "%s, Bloomington IN" % location_key

                any_updated = False
                for geo_source in location.sources:
                    if geo.lookup(search, geo_source, location, force=True):
                        any_updated = True

                location.sources = ["google", "bing", "usgeo", "geonames",
                                    "openmq", "mq"]

                # brand new locations count as updated so they reach the cache
                if (not hasattr(location, "address_alt")
                        or not location.address_alt):
                    any_updated = True

                location.address_alt = search
                location.bldg_units = bldg_units
                location.units_bdrms = units_bdrms
                locations[location_key] = location

                # handle the database storage
                bldg = make_building(location, bldg_id, city, feed_source)

                # Only attach an owner when this row actually supplied one,
                # so values parsed for an earlier row are never reused.
                if owner:
                    special = special_cases(owner)
                    if special:
                        (owner_name, owner_address) = special
                    else:
                        (owner_name, owner_address,
                         _owner_phone, _remainder) = parse_person(owner)
                    if owner_name:
                        make_person(owner_name, bldg, "Owner",
                                    address=owner_address)

                if agent and agent != "No Agent":
                    (agent_name, agent_address,
                     _agent_phone, _remainder) = parse_person(agent)
                    if agent_name:
                        make_person(agent_name, bldg, "Agent",
                                    address=agent_address, city=city)

                if any_updated:
                    # back it up for later
                    # enable this when downloading GPS coordinates...
                    # the rest of the time it slows things down
                    local_cache["buildings"] = dict(
                        (key, value.to_dict())
                        for key, value in locations.items())
                    save_json(cache_destination, local_cache)

                print("")

    save_results(locations, "bloomington-filtered.tsv")
def read_csv(source_csv, city_name, city_tag, start_row=27187):
    """Import a parcel/address CSV for one city into the database.

    A City tagged ``city_tag`` must already exist.  The function finds or
    creates the FeedInfo dated 2013-10-16, a placeholder Person named "Blank"
    and the Source linking the two.  It then walks the CSV: each row's street
    address is normalised, resolved with ``address_search()`` (or taken from
    the local cache), turned into a building via
    ``lookup_building_with_geo()`` and linked to the row's owner with
    ``make_person()``.

    Search results are cached in ``<city tag>-20150525.json`` next to
    ``source_csv``; the cache is rewritten after every row that needed a new
    search.  Addresses that end up empty are appended to ``skips.txt``.

    Args:
        source_csv: path of the CSV file to import.
        city_name: accepted for compatibility; the name used in searches is
            read from the City record instead.
        city_tag: tag of the City the rows belong to.
        start_row: 1-based number (header excluded) of the first data row to
            process; earlier rows are read and counted but otherwise ignored.
            Defaults to the previously hard-coded 27187.

    Raises:
        ValueError: if no City tagged ``city_tag`` exists, or if the building
            lookup reports errors for a row.
    """
    city_options = City.objects.filter(tag=city_tag)
    city_count = len(city_options)
    print("Number of cities available: %s" % city_count)
    if not city_count:
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-10-16"
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    # placeholder person that owns the Source record
    people = Person.objects.filter(name="Blank")
    if people.exists():
        blank_person = people[0]
        print("Already had person: %s" % (blank_person.name))
    else:
        blank_person = Person()
        blank_person.name = "Blank"
        blank_person.save()
        print("Created new person: %s" % blank_person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s"
              % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = blank_person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if "buildings" not in local_cache:
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        cached = SearchResults()
        cached.from_dict(value)
        search_results[key] = cached

    # qualifiers that confuse the google lookup; they are simply dropped
    ignored_qualifiers = ("DUP", "DUPE", "2-Jan", "HM", "DWN")
    # similar to conversions,
    # but there are too many of these to list there
    street_fixes = (
        ("HOLLY RIDGE LN", "HOLLYRIDGE LN"),
        ("BERKSHIRE CT", "BERKSHIRE"),
        ("CAMERON CT", "CAMERON"),
        ("ATHENS CT", "ATHENS"),
        ("LAMAR CT", "LAMAR"),
        ("MONITEAU CT", "MONITEAU"),
    )
    # any address on these streets is blanked (and therefore skipped)
    skipped_streets = ("IMPERIAL CT", "PERKINS DR", "GRANITE OAKS CT")

    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print(">, <".join(next(reader)))

        # in order to randomize, should randomize the order in the csv
        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)
            any_updated = False

            # if you want to skip ahead more quickly:
            if count < start_row:
                continue

            # CSV layout (columns not listed are not used here):
            #   0 objectid                 1 parcel id
            #   2-5 street number / direction / name / suffix
            #   6 qualifier_pre (eg building number)
            #   7 qualifier_post (eg "UNIT" or "APT")     8 apt number
            #   10 zip code    15 x coord (lng)    16 y coord (lat)
            #   17 entry floor ('z' in sheet)  22 zone  23 building type
            #   24 number of buildings     25 number of units
            #   34 central heat (Y/N)      35 heat mechanism (unsure)
            #   42 owner name (could be owner, could be agent)
            #   43-47 owner address1, address2, city, state, zip
            # the parcel id can also serve as the building id
            parcel_id = row[1]
            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            qualifier_pre = row[6]
            qualifier_post = row[7]
            apt_num = row[8]
            zip_code = row[10]
            lng = row[15]
            lat = row[16]
            owner_name = row[42]
            owner_address = ", ".join(row[43:48])

            # this is causing problems with lookups in google
            if qualifier_pre in ignored_qualifiers:
                qualifier_pre = ""

            street_parts = [street_num, street_dir, street_name, street_sfx,
                            qualifier_pre]
            address_main = " ".join(street_parts).strip()
            # get rid of any double spaces (left by empty columns)
            address_main = address_main.replace("  ", " ")

            for wrong, right in street_fixes:
                address_main = address_main.replace(wrong, right)
            if any(street in address_main for street in skipped_streets):
                address_main = ""

            # sometimes the 'BLDG' data is added in the wrong place
            # then it gets treated as a unit item
            # (but it's not *always* a unit item, so can't generalize it)
            if qualifier_post in ("BLDG", "LOT"):
                # The number belongs to the street address, not to a unit.
                # (This used to read apt_main, which is not set yet for the
                # current row.)
                address_main = " ".join(
                    [address_main, qualifier_post, apt_num]).strip()
                apt_main = ""
            else:
                apt_main = " ".join([qualifier_post, apt_num]).strip()

            # check if this is one we want to skip or rewrite
            main_key = address_main.upper()
            if main_key in conversions:
                address_main = conversions[main_key]

            if address_main:
                print("APT_MAIN:  %s" % apt_main)
                address = ", ".join([address_main, apt_main])
            else:
                address = ""

            print("Parcel ID: %s" % parcel_id)
            print(address)

            # make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                # remember what was skipped so it can be reviewed later
                with codecs.open("skips.txt", "a",
                                 encoding="utf-8") as skipf:
                    skipf.write(" ".join(street_parts))
                    skipf.write("\n")
                continue

            # check if we've started processing any results for this row
            cache_key = address.upper()
            if cache_key in search_results:
                print("Already had building: %s" % address)
                results = search_results[cache_key]
            else:
                addy = ", ".join([address_main, city.name, city.state])
                addy += " " + zip_code
                print(addy)

                results = address_search(addy, apt_main)

                if len(results.matches) > 1:
                    # Several candidates: keep the one whose coordinates
                    # agree with the CSV.
                    print(results)
                    for option in results.matches:
                        print("%s: %s, %s" % (option["place"], option["lat"],
                                              option["lng"]))
                    print("")
                    print("Source Lat: %s, Lng: %s" % (lat, lng))
                    # only want to look at the first 2 decimal places:
                    src_lat = int(float(lat) * 100)
                    src_lng = int(float(lng) * 100)

                    matched = False
                    # iterate over the original list; results.matches is
                    # rebound below, and the last agreeing candidate wins
                    for current in results.matches:
                        print(current["lat"])
                        print(current["lng"])
                        comp_lat = int(float(current["lat"]) * 100)
                        comp_lng = int(float(current["lng"]) * 100)
                        print(comp_lat)
                        print(comp_lng)
                        if src_lat == comp_lat and src_lng == comp_lng:
                            results.matches = [current]
                            matched = True

                    if not matched:
                        print("DIDN'T MATCH!")
                        exit()

                # a fresh search always needs to be written to the cache
                any_updated = True

            assert results
            lookup_building_with_geo(results, make=True, parcel_id=parcel_id)

            if results.errors:
                print(results)
                raise ValueError(results.errors)

            search_results[cache_key] = results
            bldg = results.building
            assert bldg
            # results.unit may be blank when another unit with a
            # number/letter was created earlier for an existing building;
            # not necessarily an error, just redundant data
            make_person(owner_name, bldg, "Agent", address=owner_address)

            if any_updated:
                # back it up for later
                # enable this when downloading GPS coordinates...
                # the rest of the time it slows things down
                local_cache["buildings"] = dict(
                    (key, value.to_dict())
                    for key, value in search_results.items())
                save_json(cache_destination, local_cache)

            print("")