def _energy_address_parts(row):
    """Return ``(address_main, apt_main)`` for one row of the rental CSV.

    Columns 2-8 of ``row`` are read as street number, direction, name,
    suffix, pre-qualifier (eg building number), post-qualifier (eg "UNIT"
    or "APT") and apartment number.  ``address_main`` comes back as an
    empty string for rows that should be skipped.
    """
    street_num = row[2]
    street_dir = row[3]
    street_name = row[4]
    street_sfx = row[5]
    qualifier_pre = row[6]
    qualifier_post = row[7]
    apt_num = row[8]

    # these qualifiers cause problems with lookups in google
    if qualifier_pre in ("DUP", "DUPE", "2-Jan", "HM", "DWN"):
        qualifier_pre = ''

    address_main = " ".join(
        [street_num, street_dir, street_name, street_sfx, qualifier_pre])
    address_main = address_main.strip()
    # get rid of any double spaces.  Kept as a single pass on purpose:
    # the result is used (upper-cased) as the key into the saved search
    # results, so the normalization must not drift.
    address_main = address_main.replace("  ", " ")

    # similar to conversions,
    # but there are too many of these to list there
    renames = (
        ('HOLLY RIDGE LN', 'HOLLYRIDGE LN'),
        ('BERKSHIRE CT', 'BERKSHIRE'),
        ('CAMERON CT', 'CAMERON'),
        ('ATHENS CT', 'ATHENS'),
        ('LAMAR CT', 'LAMAR'),
        ('MONITEAU CT', 'MONITEAU'),
    )
    for old, new in renames:
        address_main = address_main.replace(old, new)

    # streets that are skipped entirely
    for street in ('IMPERIAL CT', 'PERKINS DR', 'GRANITE OAKS CT'):
        if street in address_main:
            address_main = ''

    # sometimes the 'BLDG' data is added in the wrong place
    # then it gets treated as a unit item
    # (but it's not *always* a unit item, so can't generalize it that way)
    if qualifier_post in ("BLDG", "LOT"):
        # FIX: this used to join ``apt_main`` (unset on the first row, stale
        # afterwards) instead of this row's ``apt_num``.  Cached keys that
        # were generated by the old logic may not match for these rows.
        if address_main:
            # only extend a real address; a skipped (blank) one stays blank
            address_main = " ".join([address_main, qualifier_post, apt_num])
            address_main = address_main.strip()
        apt_main = ''
    else:
        apt_main = " ".join([qualifier_post, apt_num]).strip()

    # check if this is one we want to skip (or manually fix)
    lookup = address_main.upper()
    if lookup in conversions:
        address_main = conversions[lookup]

    return address_main, apt_main


def _energy_import_usage_table(driver, table_id, unit_label, utility_type,
                               uom, bldg, unit, provider):
    """Pass every row of one usage table on the page to ``update_summary``.

    ``table_id`` is the id of the html table ('el_table' or 'wr_table').
    Each table row is read as: month (gets '-01' appended), cost (``$``
    removed) and amount (``unit_label`` such as ' KWH' removed).
    """
    body = driver.find_element_by_css_selector(
        '#%s > tbody:nth-child(3)' % table_id)
    query = bldg.utilitysummary_set.filter(type=utility_type)
    for table_row in body.find_elements_by_tag_name('tr'):
        cols = table_row.find_elements_by_tag_name('td')
        date = cols[0].text + '-01'
        cost = cols[1].text.replace('$', '').strip()
        amount = cols[2].text.replace(unit_label, '')
        update_summary(query, date, cost, amount, bldg, unit, provider,
                       utility_type, uom)


def _energy_import_row(row, search_results, driver, provider):
    """Look up and store utility data for the building described by ``row``.

    Rows without an address, without saved search results, or without a
    matching unit are reported and skipped.

    Raises ValueError when the building lookup reports errors.
    """
    # the parcel id doubles as the LOCID used by the utility data page
    parcel_id = row[1]

    address_main, apt_main = _energy_address_parts(row)
    # FIX: always start from a blank address so a skipped row can not
    # inherit the address of the previous row
    address = ''
    if address_main:
        print("APT_MAIN:  %s" % apt_main)
        address = ", ".join([address_main, apt_main])

    print("Parcel ID: %s" % parcel_id)
    print(address)

    # make sure it's not one we're skipping:
    if not address:
        print("SKIPPING ITEM: %s" % row[1])
        return

    # this pass never geocodes; it only works from saved search results
    cache_key = address.upper()
    if cache_key not in search_results:
        print("No saved search results for address: %s" % address)
        print("Skipping.")
        print("")
        return

    print("Already had building: %s" % address)
    results = search_results[cache_key]
    assert results
    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
    if results.errors:
        print(results)
        raise ValueError(results.errors)

    bldg = results.building
    assert bldg
    unit = results.unit
    # utility data is associated with a unit, so one is required
    if not unit:
        print("Skipping address... no matching Unit!")
        return

    # the data page is addressed directly by the property id we already have
    base = ("http://www.gocolumbiamo.com/cfforms/ub/ubdata.cfm"
            "?LOCID=%s&AppNum=79" % parcel_id)
    driver.get(base)

    heat_selector = ('#PrimaryCenterColumn > table:nth-child(1) > '
                     'tbody:nth-child(1) > tr:nth-child(3) > '
                     'td:nth-child(1) > strong:nth-child(1) > '
                     'font:nth-child(1)')
    # ``except Exception`` (not a bare except) so that Ctrl-C and exit()
    # are no longer swallowed by the scraping fallbacks below.
    try:
        heat_text = driver.find_element_by_css_selector(heat_selector).text
    except Exception:
        print("heat source not found... skipping")
    else:
        if heat_text.strip() == "Heating Source: Gas Heat":
            bldg.heat_source_details = 'gas'
            bldg.save()
        else:
            # The old code called exit() here, but the surrounding bare
            # except swallowed it, so processing always continued.  Keep
            # continuing, but say why nothing was stored.
            # TODO: bldg.heat_source_details = 'electric'
            #       bldg.who_pays_gas = 'not_available'
            print("Unrecognized heat source (left unchanged): %s" % heat_text)

    length_option = ('#%s_length > label:nth-child(1) > '
                     'select:nth-child(1) > option:nth-child(3)')

    # third option of the table-length dropdown
    # (presumably shows more rows per page -- TODO confirm)
    try:
        driver.find_element_by_css_selector(
            length_option % 'el_table').click()
    except Exception:
        # FIX: message used to say "Water" for the electricity table
        print("No Electricity data available... skipping")
    else:
        _energy_import_usage_table(driver, 'el_table', ' KWH',
                                   'electricity', 'kwh',
                                   bldg, unit, provider)

    try:
        # '#ext-gen23' is clicked first to reach the water table
        driver.find_element_by_css_selector('#ext-gen23').click()
        driver.find_element_by_css_selector(
            length_option % 'wr_table').click()
    except Exception:
        print("No Water data available... skipping")
    else:
        _energy_import_usage_table(driver, 'wr_table', ' CCF',
                                   'water', 'ccf',
                                   bldg, unit, provider)

    unit.update_averages()
    # see if we have enough info now to make a score:
    unit.update_energy_score()
    # now that we've saved the unit,
    # update the averages for the whole building:
    unit.building.update_utility_averages()
    unit.building.update_rent_details()


def read_csv(source_csv, city_name, city_tag, driver):
    """Add utility usage data to buildings listed in ``source_csv``.

    For every CSV row (after the header) the address is rebuilt, matched
    against the saved search results next to ``source_csv``, and the
    utility data page for the row's parcel id is loaded with ``driver``.
    Progress is stored in ``position.json`` so that rows numbered below
    the saved position are skipped on a later run.

    Args:
        source_csv: path of the csv file to read.
        city_name: not used by this function.
        city_tag: tag of an existing ``City``.
        driver: browser driver offering ``get`` and
            ``find_element_by_css_selector`` (eg selenium webdriver).

    Raises:
        ValueError: if the city or the 'City of Columbia' service provider
            can not be found, or a building lookup reports errors.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    position_file = "position.json"
    position = load_json(position_file, create=True)
    if not position:
        position = 0

    # local copy of previously processed search results, so remote
    # geolocation APIs do not need to be called again:
    cache_file = "%s-20150525.json.bkup" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)

    search_results = {}
    for key, value in local_cache.get('buildings', {}).items():
        saved = SearchResults()
        saved.from_dict(value)
        search_results[key] = saved

    provider_options = ServiceProvider.objects.filter(name='City of Columbia')
    if not len(provider_options):
        raise ValueError("error finding utility_provider: %s matches"
                         % len(provider_options))
    provider = provider_options[0]

    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print('>, <'.join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s, position: %s" % (count, position))
            start = datetime.now()
            print("Started:  %s" % start)

            # rows below the saved position were handled by an earlier run
            if count >= position:
                _energy_import_row(row, search_results, driver, provider)
                position += 1
                save_json(position_file, position)

            end = datetime.now()
            print("finished:  %s" % end)
            print(end - start)
            print("")
def _building_value_from_sheet(bldg, sk, value):
    """Convert one spreadsheet cell to the value stored on ``bldg``.

    ``sk`` is the spreadsheet column key.  Depending on the column the
    cell is turned into a boolean, a comma separated string of choices or
    a list of choices.  For list columns any text that did not match a
    choice is stored on the related ``*_other`` attribute of ``bldg``.
    Columns without a special rule are returned unchanged.

    Raises ValueError for an unknown parking option.
    """
    boolean_columns = (
        "recycling",
        "pets",
        "gym_fitness_center",
        "pool",
        "game_room_rec_center_community_center",
    )
    if sk in boolean_columns:
        return check_boolean(value)

    # stored as one comma separated string
    joined_choices = {
        "laundry": "LAUNDRY_CHOICES",
        "air_conditioning": "AC_CHOICES",
        "unit_type": "TYPE_CHOICES",
    }
    if sk in joined_choices:
        (clean, rest) = check_choices(getattr(bldg, joined_choices[sk]), value)
        if sk == "laundry" and re.search(r'W/D incl\. in unit', rest):
            clean.append('in_unit')
        return ','.join(clean)

    if sk == "parking":
        (clean, rest) = check_choices(bldg.PARKING_CHOICES, value)
        if rest:
            raise ValueError("Unknown parking option: %s" % rest)
        return clean

    # stored as a list; unmatched text goes to the named attribute
    list_choices = {
        "bike_friendly": ("BIKE_CHOICES", "bike_friendly_other"),
        "public_transit": ("TRANSIT_CHOICES", "transit_friendly_other"),
        "other_smartliving_features": ("ENERGY_SAVING_CHOICES",
                                       "energy_saving_other"),
        "energy_saving_features": ("ENERGY_SAVING_CHOICES",
                                   "energy_saving_other"),
        "gardening": ("GARDEN_CHOICES", "garden_other"),
        "walk_friendly": ("WALK_CHOICES", "walk_friendly_other"),
    }
    if sk in list_choices:
        if sk == "energy_saving_features" and value == "Near Bus Route":
            # this value shows up here in the spreadsheet
            # that is incorrect... bad data...
            # this is a fix for that
            # (multiselectfield returns a list automatically)
            cur_values = bldg.transit_friendly_details
            if 'access' not in cur_values:
                print("Adding access to transit friendly details")
                cur_values.append('access')
            bldg.transit_friendly_details = cur_values

        choices_name, other_attribute = list_choices[sk]
        (clean, rest) = check_choices(getattr(bldg, choices_name), value)
        if rest:
            # might lose some data here if both other_smartliving_features
            # and energy_saving_features are set with different data
            setattr(bldg, other_attribute, rest)
        return clean

    return value


def process_row(current, row, keys, local_cache, city, feed_source, count):
    """
    work on adding all of the details from one row
    to the matching building address

    ``current`` is filled with the cells of ``row`` using ``keys`` as
    names.  The address is looked up in ``local_cache`` or searched for
    the first time, then the matching building and unit are updated from
    the spreadsheet values and saved.  Addresses whose lookup reports
    errors are appended to ``skips.txt`` and left untouched.

    ``count`` is accepted but not used here.

    Returns the address that was used for the lookup.

    Raises ValueError when the unit in the address disagrees with the
    unit column, or for unknown parking / who-pays values.
    """
    for index, key in enumerate(keys):
        current[key] = row[index]

    address = current['street_address']
    sheet_unit = current['unit_if_applicable']

    # if unit is in second column, need it here...
    # otherwise everything gets over-written for address in local_cache
    if sheet_unit:
        address = ', '.join([address, sheet_unit])

    if address in conversions:
        address = conversions[address]
        print("Using manually fixed address: %s" % address)

    if address in local_cache:
        print("local_cache matched  %s" % address)
        results = local_cache[address]['results']
    else:
        # do the search for the first time

        # get rid of any '*' characters...
        # these are not really part of the address:
        addy = address.replace('*', '').strip()

        # seeing units in street address with no '#' or other prefix
        # but it is separated by a comma...
        # extract that here and add a prefix (and leave out ',')
        parts = addy.split(',')
        unit = ''
        if len(parts) > 1:
            # treat last part as a unit
            unit = parts[-1].strip()
            addy = ",".join(parts[:-1]).strip()

        if unit and sheet_unit:
            if unit != sheet_unit:
                raise ValueError(
                    "Found both unit: %s and unit from spreadsheet: %s"
                    % (unit, sheet_unit))
        elif not unit:
            # NOTE(review): the original nesting of this fallback is
            # ambiguous; a unit parsed from the address is kept here
            # rather than being blanked by an empty spreadsheet cell.
            unit = sheet_unit

        # also need to add in city, state, here to help limit matches
        addy = ", ".join([addy, city.name, city.state])
        print(addy)
        results = address_search(addy, unit)

    assert results
    lookup_building_with_geo(results, make=True)
    current['results'] = results
    print(results)

    if results.errors:
        # remember the address for manual review; ``with`` makes sure the
        # file is closed even if a write fails
        with codecs.open("skips.txt", 'a', encoding='utf-8') as skips:
            skips.write(address)
            skips.write('\n')
        return address

    bldg = results.building
    assert bldg
    unit = results.unit
    assert unit

    # Now update the unit and building details as necessary:

    # building: (spreadsheet key, model attribute).
    # An ordered tuple instead of a dict: energy_saving_features must run
    # after public_transit, otherwise its "Near Bus Route" fix-up of
    # transit_friendly_details would be overwritten again.
    # NOTE(review): energy_saving_features writes the matched choices to
    # energy_saving_other, replacing the leftover text stored just before;
    # unchanged from the original -- confirm this is intended.
    bldg_map = (
        ("unit_type", "type"),
        ("laundry", "laundry"),
        ("parking", "parking_options"),
        ("pets", "pets"),
        ("gym_fitness_center", "gym"),
        ("game_room_rec_center_community_center", "game_room"),
        ("pool", "pool"),
        ("other_amenities", "amenities"),
        ("bike_friendly", "bike_friendly_details"),
        ("recycling", "recycling"),
        ("composting", "composting"),
        ("gardening", "garden_details"),
        ("public_transit", "transit_friendly_details"),
        ("walk_friendly", "walk_friendly_details"),
        ("other_smartliving_features", "energy_saving_details"),
        ("air_conditioning", "air_conditioning"),
        ("energy_saving_features", "energy_saving_other"),
    )
    for sk, model_attribute in bldg_map:
        value = _building_value_from_sheet(bldg, sk, current[sk])
        print("Setting %s (currently: %s) to: %s"
              % (model_attribute, getattr(bldg, model_attribute), value))
        setattr(bldg, model_attribute, value)

    # update values based on anything that was added here
    bldg.set_booleans()

    who_pays = {
        "who_pays_for_electricity": "who_pays_electricity",
        "who_pays_for_natural_gas": "who_pays_gas",
        "who_pays_for_water": "who_pays_water",
        "who_pays_for_trash_recycling_pickup": "who_pays_trash",
        "who_pays_for_cable": "who_pays_cable",
        "who_pays_for_internet": "who_pays_internet",
    }
    for sk, model_attribute in who_pays.items():
        (value, rest) = check_who_pays(current[sk], bldg.WHO_PAYS_CHOICES)
        if rest:
            # report the part that could not be matched (the old message
            # showed the already-converted value instead)
            raise ValueError("Unknown who pays value: %s" % rest)
        print("Setting %s (currently: %s) to: %s"
              % (model_attribute, getattr(bldg, model_attribute), value))
        setattr(bldg, model_attribute, value)

    # unit:
    numbers = {
        "rent": "rent",
        "security_deposit": "deposit",
        "sq_feet_per_unit": "sqft",
        "num_bedrooms": "bedrooms",
        "num_bathrooms": "bathrooms",
        "maximum_occupancy_per_unit": "max_occupants",
        "electric_utility_cost_average_per_mo": "average_electricity",
        "electric_utility_cost_low": "electricity_min",
        "electric_utility_cost_high": "electricity_max",
        "natural_gas_utility_cost_average_per_mo": "average_gas",
        "natural_gas_utility_cost_low": "gas_min",
        "natural_gas_utility_cost_high": "gas_max",
    }
    for sk, model_attribute in numbers.items():
        value = check_number(current[sk])
        # empty (falsy) results leave the unit attribute untouched
        if value:
            print("Setting %s (currently: %s) to: %s"
                  % (model_attribute, getattr(unit, model_attribute), value))
            setattr(unit, model_attribute, float(value))

    agent_name = current["agent_property_manager"].strip()
    agent_site = current["property_website_url"].strip()
    # special case: deep links into parkermgt are kept as the building's
    # own website, the agent gets the main site
    if agent_site != "http://parkermgt.com/" and 'parkermgt' in agent_site:
        bldg.website = agent_site
        agent_site = "http://parkermgt.com/"

    agent_address = current["agent_property_manager_address"].strip()
    agent_phone = current["agent_property_manager_phone"].strip()
    owner = current["owner"].strip()

    if agent_name or agent_site or agent_address:
        (person, bldg_person) = make_person(
            agent_name, bldg, "Agent", address=agent_address,
            website=agent_site, phone=agent_phone)
        print("created/matched agent: %s" % person.name)
        print(person)

    if owner:
        make_person(owner, bldg, "Owner")

    # not handled here: heat_source, renewable_energy, lease_period,
    # availability, the utility provider columns,
    # who_pays_for_telephone_land_line and comments

    bldg.source = feed_source
    bldg.geocoder = "google"

    bldg.save()
    unit.save()

    bldg.update_utility_averages()
    bldg.update_rent_details()

    return address
def _geocode_address_parts(row):
    """Return ``(address_main, apt_main, original)`` for one CSV row.

    Columns 2-8 of ``row`` are read as street number, direction, name,
    suffix, pre-qualifier (eg building number), post-qualifier (eg "UNIT"
    or "APT") and apartment number.  ``address_main`` is empty for rows
    that should be skipped; ``original`` is the street text before any
    fix-ups, for logging skipped rows.
    """
    street_num = row[2]
    street_dir = row[3]
    street_name = row[4]
    street_sfx = row[5]
    qualifier_pre = row[6]
    qualifier_post = row[7]
    apt_num = row[8]

    # these qualifiers cause problems with lookups in google
    if qualifier_pre in ("DUP", "DUPE", "2-Jan", "HM", "DWN"):
        qualifier_pre = ""

    original = " ".join(
        [street_num, street_dir, street_name, street_sfx, qualifier_pre])
    # get rid of any double spaces.  Kept as a single pass on purpose:
    # the result is used (upper-cased) as the cache key.
    address_main = original.strip().replace("  ", " ")

    # similar to conversions,
    # but there are too many of these to list there
    renames = (
        ("HOLLY RIDGE LN", "HOLLYRIDGE LN"),
        ("BERKSHIRE CT", "BERKSHIRE"),
        ("CAMERON CT", "CAMERON"),
        ("ATHENS CT", "ATHENS"),
        ("LAMAR CT", "LAMAR"),
        ("MONITEAU CT", "MONITEAU"),
    )
    for old, new in renames:
        address_main = address_main.replace(old, new)

    # streets that are skipped entirely
    for street in ("IMPERIAL CT", "PERKINS DR", "GRANITE OAKS CT"):
        if street in address_main:
            address_main = ""

    # sometimes the 'BLDG' data is added in the wrong place
    # then it gets treated as a unit item
    # (but it's not *always* a unit item, so can't generalize it that way)
    if qualifier_post in ("BLDG", "LOT"):
        # FIX: this used to join ``apt_main`` (unset on the first row, stale
        # afterwards) instead of this row's ``apt_num``.  Cache keys that
        # were generated by the old logic may not match for these rows.
        if address_main:
            # only extend a real address; a skipped (blank) one stays blank
            address_main = " ".join([address_main, qualifier_post, apt_num])
            address_main = address_main.strip()
        apt_main = ""
    else:
        apt_main = " ".join([qualifier_post, apt_num]).strip()

    # check if this is one we want to skip (or manually fix)
    lookup = address_main.upper()
    if lookup in conversions:
        address_main = conversions[lookup]

    return address_main, apt_main, original


def _geocode_keep_match_near(results, lat, lng):
    """Reduce ``results.matches`` to the match at the csv coordinates.

    Coordinates are compared on their first two decimal places.  When
    several candidates agree, the last one is kept.  Exits the program if
    none of them agrees.
    """
    candidates = results.matches
    print(results)
    for option in candidates:
        print("%s: %s, %s" % (option["place"], option["lat"], option["lng"]))
    print("")
    print("Source Lat: %s, Lng: %s" % (lat, lng))

    # only want to look at the first 2 decimal places:
    src_lat = int(float(lat) * 100)
    src_lng = int(float(lng) * 100)

    matched = False
    for candidate in candidates:
        print(candidate["lat"])
        print(candidate["lng"])
        comp_lat = int(float(candidate["lat"]) * 100)
        comp_lng = int(float(candidate["lng"]) * 100)
        print(comp_lat)
        print(comp_lng)
        if (src_lat == comp_lat) and (src_lng == comp_lng):
            results.matches = [candidate]
            matched = True

    if not matched:
        print("DIDN'T MATCH!")
        exit()


def _geocode_rental_row(row, city, search_results):
    """Find or create the building for ``row`` and attach its owner.

    ``search_results`` maps upper-cased addresses to earlier results and
    is updated with the result for this row.

    Returns True when a new address search had to be made (so the cache
    on disk is out of date), False otherwise.

    Raises ValueError when the building lookup reports errors.
    """
    # the parcel id is passed on to the building lookup
    parcel_id = row[1]
    zip_code = row[10]
    # xcoord == lng
    lng = row[15]
    lat = row[16]
    # could be owner, could be agent
    owner_name = row[42]
    owner_address = ", ".join([row[43], row[44], row[45], row[46], row[47]])

    address_main, apt_main, original = _geocode_address_parts(row)
    address = ""
    if address_main:
        print("APT_MAIN:  %s" % apt_main)
        address = ", ".join([address_main, apt_main])

    print("Parcel ID: %s" % parcel_id)
    print(address)

    # make sure it's not one we're skipping:
    if not address:
        print("SKIPPING ITEM: %s" % row[1])
        # ``with`` closes the log even if a write fails
        with codecs.open("skips.txt", "a", encoding="utf-8") as skipf:
            skipf.write(original)
            skipf.write("\n")
        return False

    cache_key = address.upper()
    searched = False
    if cache_key in search_results:
        print("Already had building: %s" % address)
        results = search_results[cache_key]
    else:
        addy = ", ".join([address_main, city.name, city.state])
        addy += " " + zip_code
        print(addy)

        results = address_search(addy, apt_main)
        if len(results.matches) > 1:
            _geocode_keep_match_near(results, lat, lng)
        searched = True

    assert results
    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
    if results.errors:
        print(results)
        raise ValueError(results.errors)

    search_results[cache_key] = results
    bldg = results.building
    assert bldg
    # results.unit may be blank when another unit with a number/letter was
    # created earlier; not necessarily an error, so it is not checked here

    make_person(owner_name, bldg, "Agent", address=owner_address)
    return searched


def read_csv(source_csv, city_name, city_tag, start_row=27187):
    """Create buildings (and their owners) from the rows of ``source_csv``.

    Makes sure a feed, a "Blank" person and a source exist for the city,
    loads earlier search results from the json cache next to
    ``source_csv``, then handles every row numbered ``start_row`` or
    higher (rows are counted from 1 after the header).  Whenever a new
    address search was made, the cache file is rewritten.

    Args:
        source_csv: path of the csv file to read.
        city_name: not used by this function.
        city_tag: tag of an existing ``City``.
        start_row: first row number to process; lower rows are skipped.
            Defaults to the previously hard-coded 27187.

    Raises:
        ValueError: if the city can not be found or a building lookup
            reports errors.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]
    print(city)

    feed_date = "2013-10-16"
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s"
              % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote
    # geolocation APIs:
    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    local_cache = load_json(cache_destination, create=True)
    if "buildings" not in local_cache:
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        saved = SearchResults()
        saved.from_dict(value)
        search_results[key] = saved

    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print(">, <".join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            any_updated = False
            # if you want to skip ahead more quickly, raise start_row
            if count >= start_row:
                any_updated = _geocode_rental_row(row, city, search_results)

            if any_updated:
                # back it up for later
                # (only needed after a new search; rewriting the whole
                # cache for every row slows things down)
                local_cache["buildings"] = {}
                for key, value in search_results.items():
                    local_cache["buildings"][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")