def address(address):
    '''
    Given an address, use the Census Geocoder API to retrieve tract data
    and lat/long data, which are passed into the respective compute-score
    functions. The function also calls the distribution functions so that
    graphics can be generated.
    '''
    result = None
    tract = None
    # Retry until the geocoder responds; the API occasionally errors out.
    while result is None:
        try:
            result = cg.address(address, city='Chicago', state='IL')
        except Exception:
            pass
    # Retry until a tract can be read out of the response.
    while tract is None:
        try:
            tract = int(result[0]['geographies']
                        ['2010 Census Blocks'][0]['TRACT'])
        except Exception:
            result = cg.address(address, city='Chicago', state='IL')
    lat = float(result[0]['geographies']['2010 Census Blocks'][0]['INTPTLAT'])
    longi = float(result[0]['geographies']['2010 Census Blocks'][0]['INTPTLON'])
    ed_score = edu.eduscore(lat, longi, 1000)
    sa_score = sani.get_sanitation_score(longi, lat)
    racedist(tract, tracttocomm)
    incomedist(tract, tracttocomm)
    agedist(tract, tracttocomm)
    return lat, longi, ed_score, sa_score
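The retry loops above never give up, so a persistent API failure hangs the caller. Below is a minimal bounded-retry sketch of the same pattern, assuming `censusgeocode` is imported as `cg`; `geocode_with_retry`, `attempts`, and `delay` are illustrative names, not part of the original code.

import time
import censusgeocode as cg

def geocode_with_retry(addr, city='Chicago', state='IL', attempts=3, delay=2):
    # Try a fixed number of times with a growing pause between calls,
    # then give up and return None instead of looping forever.
    for attempt in range(attempts):
        try:
            result = cg.address(addr, city=city, state=state)
            if result:  # non-empty match list
                return result
        except Exception:
            pass
        time.sleep(delay * (attempt + 1))
    return None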
def geocode_util(num, name, typ, city, state, zipcode):
    # time.sleep(1)
    addr = str(num) + " " + str(name) + " " + str(typ)
    try:
        result = cg.address(addr, city=city, state=state,
                            zipcode=zipcode, returntype='geographies')
        tract = result[0]['geographies']['Census Tracts'][0]['GEOID']
    except Exception:
        # No match (or a request error): signal failure with None.
        tract = None
    return tract
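A hedged usage sketch for geocode_util, with an address split into its number, street-name, and street-type parts; the values are illustrative, not from the original.

# Returns the tract GEOID string, or None if the lookup fails.
tract = geocode_util(75, 'Dayton', 'Rd', 'Redding', 'CT', '06896')
print(tract)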
def confirm_address():
    data = cg.address(request.form.get('address'),
                      city=request.form.get('city'),
                      state=request.form.get('state'),
                      zipcode=request.form.get('zipcode'))
    context = {
        'data': data,
    }
    if request.method == "POST":
        address = Address(address=data[0]['matchedAddress'],
                          census_tract=data[0]['geographies']['Census Tracts'][0]['BASENAME'],
                          latitude=data[0]['geographies']['Census Tracts'][0]['CENTLAT'],
                          longitude=data[0]['geographies']['Census Tracts'][0]['CENTLON'],
                          zip=data[0]['addressComponents']['zip'])
        db.session.add(address)
        db.session.commit()
        # Stash the new row's id in the session for the next request.
        session['confirm_address_id'] = address.id
        print(session['confirm_address_id'])
    return render_template('confirm_address.html', **context)
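Since the view reads request.form and branches on request.method, it presumably sits behind a Flask route that accepts both GET and POST. A sketch of the missing registration; the path and the `app` object are assumptions, not shown in the original.

@app.route('/confirm_address', methods=['GET', 'POST'])
def confirm_address():
    ...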
def census_geocode(row, i):
    g = ()
    try:
        g = cg.address(street=str(row['street']),
                       city=str(row['city']),
                       state=str(row['state']),
                       zipcode=str(row['zip']))
    except Exception:
        g = ()
    finally:
        # Only write coordinates back if the geocoder returned a match.
        if len(g) > 0:
            data = g[0]
            row['lat'] = data['coordinates']['y']
            row['lng'] = data['coordinates']['x']
    # Print a progress marker every 1000 rows.
    if i % 1000 == 0:
        print(i)
    return row
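census_geocode is written to be mapped over DataFrame rows. One way to drive it, assuming a frame with street/city/state/zip columns and a default RangeIndex so that row.name doubles as the progress counter; this is a sketch, not from the original.

import pandas as pd

df = pd.DataFrame([{'street': '75 Dayton Rd', 'city': 'Redding',
                    'state': 'CT', 'zip': '06896'}])
# apply() passes each row as a Series; row.name is its integer index label.
df = df.apply(lambda row: census_geocode(row, row.name), axis=1)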
geodata['AREALAND'] = ''
geodata['INTPTLON'] = ''
geodata['MTFCC'] = ''
geodata['LWBLKTYP'] = ''
geodata['COUNTY'] = ''

# Measure starting time
startingtime = time.time()

# Info
print('#######\tstart index: {}\n#######\tstart time: {}\n#######\tdestination folder: {}\n'
      .format(start, strftime('%H%M', gmtime()), destinationpath))

for i in range(start, n):
    print(i, n)
    # Get address
    searchresult = cg.address(street=geodata['street'].iloc[i],
                              city=geodata['city'].iloc[i],
                              state=geodata['state'].iloc[i],
                              zipcode=geodata['zip'].iloc[i])
    percentage = round((i / n), 5)
    sys.stdout.write("\033[F")  # Cursor up one line
    print('address {} downloading...\t{} done'.format(i, percentage))
    # If the search result is empty, wait 3 seconds and retry once.
    # Note: the original tested `searchresult is []`, which is always
    # False; test for emptiness instead.
    if not searchresult:
        print('\t\n no results found! Retrying download ...')
        time.sleep(3)  # the original called timer.sleep(3); time.sleep is assumed
        searchresult = cg.address(street=geodata['street'].iloc[i],
                                  city=geodata['city'].iloc[i],
                                  state=geodata['state'].iloc[i],
                                  zipcode=geodata['zip'].iloc[i])
def listing_info(url_list):
    count = 0
    Buildings = []
    for link in url_list:
        count += 1
        site_facts = {}
        url = "{}".format(link)  # Puts the list link in the loop
        r = requests.get(url, headers=headers)
        page_soup = bs(r.content, features="html.parser")
        # Split the url to get the trailing digits for the primary key.
        id_array = url.split('/')
        site_facts['CS_ID'] = "LN-" + id_array[-2]
        site_facts['url'] = url  # Adds the url to the dictionary
        # Find the address on the page.
        loc = page_soup.find(
            "h1", class_="breadcrumbs__crumb breadcrumbs__crumb-title")
        try:
            # If the listing has no address, skip to the next item.
            loc = loc.get_text()
        except Exception as err:
            continue
        # Check whether a postal code ends the address.
        # TODO change this to use .split()
        check = loc[-5:].isdigit()
        if check:
            a1 = loc.split(", ")
            # Get AddressLine
            site_facts['Address_Line'] = a1[0]
            # Get City
            site_facts['City'] = a1[1]
            # Get State
            site_facts['State'] = a1[2][0:2]
            # Get Zip
            site_facts['Postal_Code'] = a1[2][-5:]
            geocode = cg.address(street=site_facts['Address_Line'],
                                 city=site_facts['City'],
                                 state=site_facts['State'],
                                 zipcode=site_facts['Postal_Code'])
            try:
                GEOID = geocode[0]['geographies']['2020 Census Blocks'][0][
                    'GEOID'][0:12]
                site_facts['bg_geo_id'] = GEOID
                print(count, site_facts['Address_Line'],
                      site_facts['City'], GEOID)
            except Exception as err:
                site_facts['bg_geo_id'] = None
        else:
            site_facts['Address_Line'] = loc
            site_facts["City"] = "N/A"
            site_facts['State'] = "N/A"
            site_facts['Postal_Code'] = "N/A"
            site_facts['bg_geo_id'] = None
        # Test how the data is formatted on the listing page.
        is_column = bool(page_soup.find(
            "div", {"class": "property-facts__labels-one-col"}))
        if is_column == True:
            # This branch handles listings that use columns.
            # Temp lists to store property information
            property_label = []
            property_data = []
            # Select the children of the label and data columns.
            labels = page_soup.find(
                "div", {"class": "property-facts__labels-one-col"}
            ).find_all('div', recursive=False)
            datas = page_soup.find(
                "div", {"class": "property-facts__data-one-col"}
            ).find_all('div', recursive=False)
            # These loops isolate the text from the html and put it into lists.
            for label in labels:
                property_label.append(
                    re.sub(r"[\n\r\t]*", "", label.get_text()))
            for data in datas:
                # Remove tabs, newlines, and carriage returns.
                property_data.append(
                    re.sub(r"[\n\r\t]*", "", data.get_text()))
            # Zip the two lists into a dictionary of label -> value.
            temp_dict1 = dict(zip(property_label, property_data))
            # This section grabs info from the dictionary for each listing.
            # Get Property Type
            site_facts['Property_Type'] = temp_dict1.get('Property Type', 'N/A')
            # Get price
            site_facts['Price'] = temp_dict1.get('Price', None)
            if site_facts['Price'] is None:
                pass
            elif '-' in site_facts['Price']:
                site_facts['Price'] = None
            else:
                temprice = site_facts['Price'].replace(',', '')
                site_facts['Price'] = int(temprice.strip('$'))
            # Get Square Feet
            site_facts['SquareFeet'] = temp_dict1.get('Building Size', None)
            if site_facts['SquareFeet'] is not None:
                tempft = site_facts['SquareFeet'].replace(',', '')
                site_facts['SquareFeet'] = int(tempft.strip('SF'))
            # Get Building Class
            site_facts['Building_Class'] = temp_dict1.get('Building Class', 'N/A')
            # Get Year Built
            if 'Year Built' in temp_dict1:
                site_facts['Year_Built'] = temp_dict1['Year Built']
            elif 'Year Built/Renovated' in temp_dict1:
                site_facts['Year_Built'] = temp_dict1['Year Built/Renovated']
            else:
                site_facts['Year_Built'] = "N/A"
            # # Get Parking spots
            # if 'Parking' in temp_dict1:
            #     site_facts['Parking_Ratio'] = temp_dict1['Parking']
            # elif 'Parking Ratio' in temp_dict1:
            #     site_facts['Parking_Ratio'] = temp_dict1['Parking Ratio']
            # else:
            #     site_facts['Parking_Ratio'] = 'N/A'
            # Get Sale Type
            site_facts['Sale_Type'] = temp_dict1.get('Sale Type', 'N/A')
            site_facts["Picture_url"] = "N/A"
            site_facts["Upload_Date"] = datetime.now().strftime("%Y-%m-%d")
            # TODO Add currently-listed status to the dictionary. How will we
            # store the urls that are not active?
            site_facts["Currently_Listed"] = True
            site_facts["Sale_Leased"] = "Sale"
            Buildings.append(site_facts)  # Append this listing to the buildings list
            sleep(randint(5, 10))
        if is_column == False:
            # This branch handles listings laid out as a table.
            table = page_soup.table
            table_data = table.find_all('td')
            t_list = []
            temp_dict2 = {}
            for td in table_data:
                strip_td = re.sub(r"[\n \r\t]*", "", td.get_text())
                t_list.append(strip_td)
            # Turn the flat list into a dictionary of label -> value pairs.
            temp_dict2 = {
                t_list[i]: t_list[i + 1] for i in range(0, len(t_list), 2)
            }
            # Get Property Type
            site_facts['Property_Type'] = temp_dict2.get('PropertyType', 'N/A')
            # Get price
            site_facts['Price'] = temp_dict2.get('Price', None)
            if site_facts['Price'] is None:
                pass
            elif '-' in site_facts['Price']:
                site_facts['Price'] = None
            else:
                temprice = site_facts['Price'].replace(',', '')
                site_facts['Price'] = int(temprice.strip('$'))
            # Get Square Feet. Use an elif chain so an earlier match is not
            # overwritten: the original used independent ifs, so the final
            # else reset SquareFeet to None whenever RentableBuildingArea
            # was absent.
            if 'BuildingSize' in temp_dict2:
                site_facts['SquareFeet'] = temp_dict2['BuildingSize']
            elif 'TotalBuildingSize' in temp_dict2:
                site_facts['SquareFeet'] = temp_dict2['TotalBuildingSize']
            elif 'UnitSize' in temp_dict2:
                site_facts['SquareFeet'] = temp_dict2['UnitSize']
            elif 'RentableBuildingArea' in temp_dict2:
                site_facts['SquareFeet'] = temp_dict2['RentableBuildingArea']
            else:
                site_facts['SquareFeet'] = None
            if site_facts['SquareFeet'] is not None:
                tempft = site_facts['SquareFeet'].replace(',', '')
                site_facts['SquareFeet'] = int(tempft.strip('SF'))
            # Get Building Class
            site_facts['Building_Class'] = temp_dict2.get('BuildingClass', 'N/A')
            # Get Year Built
            if 'YearBuilt/Renovated' in temp_dict2:
                site_facts['Year_Built'] = temp_dict2['YearBuilt/Renovated']
            elif 'YearBuilt' in temp_dict2:
                site_facts['Year_Built'] = temp_dict2['YearBuilt']
            else:
                site_facts['Year_Built'] = "N/A"
            # # Get Parking info
            # if 'Parking' in temp_dict2:
            #     site_facts['Parking_Ratio'] = temp_dict2['Parking']
            # elif 'ParkingRatio' in temp_dict2:
            #     site_facts['Parking_Ratio'] = temp_dict2['ParkingRatio']
            # else:
            #     site_facts['Parking_Ratio'] = 'N/A'
            # Get Sale Type
            site_facts['Sale_Type'] = temp_dict2.get('SaleType', 'N/A')
            site_facts["Picture_url"] = "N/A"
            site_facts["Upload_Date"] = datetime.now().strftime("%Y-%m-%d")
            site_facts["Currently_Listed"] = True
            site_facts["Sale_Leased"] = "Sale"
            Buildings.append(site_facts)  # Add site_facts to the Buildings list
            sleep(randint(5, 10))
    return Buildings
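A hedged usage sketch: running the scraper over a URL list and tabulating the output with pandas. The URL and the pandas step are illustrative, not from the original.

import pandas as pd

urls = ['https://example.com/listing/12345/']  # placeholder listing URL
buildings = listing_info(urls)
pd.DataFrame(buildings).to_csv('listings.csv', index=False)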
def get_info(df, address_col, state_col, zip_col, city_muni_col,
             subset_size, seed):
    '''
    Accepts a data frame and four strings naming the columns where the full
    address, state, zip code, and city/municipality are stored. Any rows
    with NAs in these columns are filtered out. The address attributes are
    extracted and passed to censusgeocode to retrieve the GEOID, which is
    added as a new column along with latitude and longitude. A random
    subset of rows of the specified size is used, drawn with the specified
    seed.
    '''
    # Filter out any rows with NA in an address-related column.
    f_df = df.dropna(subset=[address_col, state_col, zip_col, city_muni_col])
    # Select a random subset of rows to geocode; we don't need all of them.
    subset_f_df = f_df.sample(n=subset_size, random_state=seed)
    # Initialize empty lists to collect the GEOIDs and lat/lon.
    geoids = []
    lats = []
    lons = []
    # This takes a long time to run, so use a counter to report progress.
    total_rows = len(subset_f_df)
    rows_complete = 0
    # Iterate over the rows as tuples and pull out the address components.
    for row in subset_f_df.itertuples():
        full_add = str(getattr(row, address_col))
        state = str(getattr(row, state_col))
        zip_code = str(int(getattr(row, zip_col)))
        city = str(getattr(row, city_muni_col))
        # Call censusgeocode for each row to get the GEOID.
        result = cg.address(full_add, city=city, state=state,
                            zipcode=zip_code)
        # If no match is found, record NaNs.
        if len(result) == 0:
            geoid = np.NaN
            lat = np.NaN
            lon = np.NaN
        else:
            block = result[0]['geographies']['2010 Census Blocks'][0]
            geoid = block['GEOID'][0:11]
            # Clean the lat/lon strings, which may carry a leading '+'.
            lat = float(block['INTPTLAT'].lstrip('+'))
            lon = float(block['INTPTLON'].lstrip('+'))
        geoids.append(geoid)
        lats.append(lat)
        lons.append(lon)
        rows_complete += 1
        # Report progress.
        prog = round(rows_complete / total_rows, 4)
        print(prog * 100, "%")
    # Attach the new columns.
    subset_f_df['GEOID'] = geoids
    subset_f_df['LATITUDE'] = lats
    subset_f_df['LONGITUDE'] = lons
    # Print the NaN proportion.
    nan_prop = subset_f_df['GEOID'].isna().sum() / total_rows
    print('Percent NaN GEOIDs:', nan_prop * 100, '%')
    return subset_f_df
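An illustrative call; `parcels_df` and the column names are hypothetical, not from the original (itertuples/getattr requires the column names to be valid Python identifiers).

geocoded = get_info(parcels_df,
                    address_col='FULL_ADDRESS',
                    state_col='STATE',
                    zip_col='ZIP',
                    city_muni_col='CITY',
                    subset_size=100,
                    seed=42)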
df['FIPS'] = ''

def clean_ids(x, desired_len):
    # Left-pad an id with zeros to the desired length.
    x = str(x)
    while len(x) < desired_len:
        x = "0" + x
    return x

for i in range(len(df)):
    address = df.iloc[i]['address']
    state = df.iloc[i]['state']
    city = df.iloc[i]['city']
    result = cg.address(address, city=city, state=state)
    county_id = result[0]['geographies']['Counties'][0]['COUNTY']
    state_id = result[0]['geographies']['States'][0]['STATE']
    county_id = clean_ids(county_id, 3)
    state_id = clean_ids(state_id, 2)
    address_fips = state_id + county_id
    # The original wrote df.iloc[i]['FIPS'] = ..., which is chained
    # assignment and silently fails to modify df; write through .loc.
    df.loc[df.index[i], 'FIPS'] = address_fips

import plotly
import plotly.express as px
from plotly.offline import plot
def building_dict(url_list):
    buildings = []
    progress = 0
    for link in url_list:
        url = "{}".format(link)  # Puts the list link in the loop
        r = requests.get(url, headers=headers)
        page_soup = bs(r.content, features="html.parser")
        # The titles, in order, are "Space", "Size", "Term", "Rate",
        # "Space_Use", "Condition", "Available".
        units = page_soup.find_all(
            "ul",
            class_="available-spaces__accordion-data no-margin js-as-column-width")
        progress += 1
        counter = 1  # Used to ensure a unique CS_ID
        print(progress, url)
        for item in units:
            site_facts = {}
            unit_temp = item.get_text("|", strip=True)
            units_txt = unit_temp.split("|")
            # Find the address on the page.
            loc = page_soup.find(
                "h1", class_="breadcrumbs__crumb breadcrumbs__crumb-title")
            try:
                # If the listing has no address, skip to the next item.
                loc = loc.get_text()
            except Exception as err:
                continue
            # Check whether a postal code ends the address.
            check = loc[-5:].isdigit()
            if check:
                a1 = loc.split(", ")
                # Get AddressLine
                site_facts['Address_Line'] = a1[0]
                # Get City
                site_facts['City'] = a1[1]
                # Get State
                site_facts['State'] = a1[2][0:2]
                # Get Zip
                site_facts['Postal_Code'] = a1[2][-5:]
                if units_txt[-4].endswith("/"):
                    site_facts['Property_Type'] = units_txt[-4] + units_txt[-3]
                else:
                    site_facts['Property_Type'] = units_txt[-3]
                geocode = cg.address(street=site_facts['Address_Line'],
                                     city=site_facts['City'],
                                     state=site_facts['State'],
                                     zipcode=site_facts['Postal_Code'])
                try:
                    GEOID = geocode[0]['geographies']['2020 Census Blocks'][0][
                        'GEOID'][0:12]
                    site_facts['bg_geo_id'] = GEOID
                    print(site_facts['Address_Line'], site_facts['City'], GEOID)
                except Exception as err:
                    site_facts['bg_geo_id'] = None
            else:
                site_facts['Address_Line'] = loc
                site_facts["City"] = "N/A"
                site_facts['State'] = "N/A"
                site_facts['Postal_Code'] = "N/A"
                if units_txt[-4].endswith("/"):
                    site_facts['Property_Type'] = units_txt[-4] + units_txt[-3]
                else:
                    site_facts['Property_Type'] = units_txt[-3]
                site_facts['bg_geo_id'] = "N/A"
            # Gets the CS_ID and URL
            id_array = url.split('/')  # Split url to get trailing digits for the primary key
            site_facts['CS_ID'] = "LN-" + id_array[-2] + "-" + str(counter)
            site_facts['url'] = url  # Adds the url to the dictionary
            # Price per month: find the element matching "/MO", then step
            # one back to the numeric price.
            try:
                monthp_index = units_txt.index("/MO") - 1
                month_price = units_txt[monthp_index].replace(',', '')
                if '.' in month_price:
                    m_p = month_price.split('.')
                    site_facts['Price_month'] = int(m_p[0].strip('$'))
                else:
                    site_facts['Price_month'] = int(month_price.strip('$'))
            except ValueError as err:
                site_facts['Price_month'] = None
            # Price per year: same idea, keyed on "/YR".
            try:
                yearp_index = units_txt.index("/YR") - 1
                year_price = units_txt[yearp_index].replace(',', '')
                site_facts['Price_year'] = int(year_price.strip('$'))
            except ValueError as err:
                site_facts['Price_year'] = None
            # Square feet (size); a trailing "-" marks a size range.
            if units_txt[1].endswith("-"):
                ft_int = units_txt[1].replace(',', '')
                site_facts['SquareFeet'] = int(ft_int.strip("-"))
                max_ft = units_txt[2].replace(',', '')
                site_facts['Expansion_SqrFt'] = int(max_ft.strip(' SF'))
            else:
                ft_int = units_txt[1].replace(',', '')
                site_facts['SquareFeet'] = int(ft_int.strip(' SF'))
                site_facts['Expansion_SqrFt'] = None
            # Space
            site_facts['Space'] = units_txt[0]
            # Condition
            if units_txt[-2] == '-':
                site_facts['Condition'] = 'Not Listed'
            else:
                site_facts['Condition'] = units_txt[-2]
            # Available
            site_facts['Available'] = units_txt[-1]
            # Term: its position shifts when an expansion square footage
            # is present.
            if site_facts['Expansion_SqrFt'] is None:
                site_facts['Term'] = units_txt[2]
            else:
                site_facts['Term'] = units_txt[3]
            # Upload_Date
            site_facts['Upload_Date'] = datetime.now().strftime("%Y-%m-%d")
            # Currently_Listed
            site_facts["Currently_Listed"] = True
            # Sale_Lease
            site_facts['Sale_Lease'] = "Lease"
            # Append to buildings and bump the per-listing counter.
            buildings.append(site_facts)
            counter += 1
        sleep(randint(5, 10))
    return buildings
def clean_ids(x, desired_len):
    # Left-pad an id with zeros to the desired length.
    x = str(x)
    while len(x) < desired_len:
        x = "0" + x
    return x

fips['FIPSCd'] = fips['FIPSCd'].apply(lambda x: clean_ids(x, 5))
fips['STATEFP'] = fips['STATEFP'].apply(lambda x: clean_ids(x, 2))
fips['COUNTYFP'] = fips['COUNTYFP'].apply(lambda x: clean_ids(x, 3))

result = cg.address('75 Dayton Road', city='Redding', state='CT',
                    zipcode='06896')
county_id = result[0]['geographies']['Counties'][0]['COUNTY']
state_id = result[0]['geographies']['States'][0]['STATE']
county_id = clean_ids(county_id, 3)
state_id = clean_ids(state_id, 2)
address_fips = state_id + county_id
print(address_fips)
fips[fips.FIPSCd == address_fips].head()
fips.head()
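The zero-padding helper used here and in the FIPS loop above reimplements what the built-in str.zfill already does; an equivalent sketch of the same step:

# A 5-digit county FIPS code is the 2-digit state code plus the
# 3-digit county code; zfill left-pads with zeros to a given width.
county_id = str(result[0]['geographies']['Counties'][0]['COUNTY']).zfill(3)
state_id = str(result[0]['geographies']['States'][0]['STATE']).zfill(2)
address_fips = state_id + county_id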