def geoclient_intersection(streetNumber, streetName, boroughName):
    """Geocode an address and return its street, cross streets and borough.

    The address is sent to Geoclient's ``address`` call and four fields of the
    response are copied into a new dict:

    * ``sideOfStreet`` <- ``firstStreetNameNormalized``
    * ``fromStreet``   <- ``highCrossStreetName1``
    * ``toStreet``     <- ``lowCrossStreetName1``
    * ``borough``      <- ``firstBoroughName``

    A missing response field surfaces as ``KeyError``.
    """
    # NOTE(review): the API credentials are hard-coded in this function.
    client = Geoclient('799db7eb', '02b0bed977c344cb27b77e549eb69ed8')
    geo = client.address(streetNumber, streetName, boroughName)

    # (output key, response field) pairs, read in the original lookup order.
    wanted = (
        ('sideOfStreet', 'firstStreetNameNormalized'),
        ('fromStreet', 'highCrossStreetName1'),
        ('toStreet', 'lowCrossStreetName1'),
        ('borough', 'firstBoroughName'),
    )
    return {out_key: geo[field] for out_key, field in wanted}
def geoclientBatch(df, houseNo='houseNo', street='street', boro='boro'):
    """Geocode every row of *df* through DOITT's GeoClient.

    Uses the python wrapper https://github.com/talos/nyc-geoclient (the web
    API to DCP's GeoSupport).

    Args:
        df: DataFrame holding one address per row.
        houseNo: name of the column with the house number.
        street: name of the column with the street name.
        boro: name of the column with the borough.

    Returns:
        The same DataFrame with two additional columns, ``geocodedBBL`` and
        ``geocodedBIN``. When a lookup fails, both cells hold
        ``"Error: <exception class>"`` instead of a value.

    Note:
        Calling this function silences all warnings process-wide.
    """
    geoID = 'fb9ad04a'
    geoKey = '051f93e4125df4bae4f7c57517e62344'
    g = Geoclient(geoID, geoKey)
    warnings.filterwarnings('ignore')  # do not display warnings

    def hitGeoC(row):
        """Return ``(BBL, BIN)`` for one row, or an error string twice."""
        try:
            x = g.address(row[houseNo], row[street], row[boro])
            BBL = x['bbl']
            BIN = x['buildingIdentificationNumber']
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit can still stop a long batch.
            e = sys.exc_info()[0]
            BBL = ("Error: %s" % e)
            BIN = BBL
        return BBL, BIN

    df[['geocodedBBL', 'geocodedBIN']] = df.apply(hitGeoC, axis=1).apply(pd.Series)
    return df
def geoclientBatch(df, houseNo='houseNo', street='street', boro='boro'):
    """Geocode every row of *df* through DOITT's GeoClient.

    Uses the python wrapper https://github.com/talos/nyc-geoclient (the web
    API to DCP's GeoSupport).

    Args:
        df: DataFrame holding one address per row.
        houseNo: name of the column with the house number.
        street: name of the column with the street name.
        boro: name of the column with the borough.

    Returns:
        The same DataFrame with two additional columns, ``geoBBL`` and
        ``geoBIN``. Cell values on failure:

        * ``'---ProxyError---'`` when a ``ProxyError`` is raised;
        * ``'---InvalidAddress---'`` when the address query fails for any
          other reason;
        * ``''`` when the query succeeds but the field cannot be read from
          the response.

    Note:
        Calling this function silences all warnings process-wide.
    """
    geoID = 'fb9ad04a'
    geoKey = '051f93e4125df4bae4f7c57517e62344'
    g = Geoclient(geoID, geoKey)
    warnings.filterwarnings('ignore')  # do not display warnings

    proxy_marker = '---ProxyError---'
    invalid_marker = '---InvalidAddress---'

    def read_field(response, key):
        """Return ``response[key]``, or a marker string if it cannot be read."""
        try:
            return response[key]
        except ProxyError:
            return proxy_marker
        except Exception:
            # Field absent (or response not subscriptable): leave it blank.
            return ''

    def hitGeoC(row):
        """Return ``(BBL, BIN)`` for one row of the DataFrame."""
        # Only the API query itself can produce the whole-row error markers.
        try:
            x = g.address(row[houseNo], row[street], row[boro])
        except ProxyError:
            return proxy_marker, proxy_marker
        except Exception:
            return invalid_marker, invalid_marker
        return (read_field(x, 'bbl'),
                read_field(x, 'buildingIdentificationNumber'))

    # applies the "hitGeoC" function to every row in the DataFrame
    df[['geoBBL', 'geoBIN']] = df.apply(hitGeoC, axis=1).apply(pd.Series)
    return df
def main(): ###Instantiate geoclient wrapper g = Geoclient('7cb56bda', '51f262e341572a09e73aa32eb1dda793') ###Read in data bbl_to_nta = pd.read_csv('./../data/BBL_to_NTA.csv', dtype=str) file_names = make_file_names() bad_api_calls = {} for file in file_names: bad_api_calls[file] = {} if file[0:4] == '2010': skiprows_n = 3 else: skiprows_n = 4 boro_year_data = pd.read_excel('./../data/sales_data/{}'.format(file), sheetname=0, skiprows=range(skiprows_n)) ###Clean and make new features boro_year_data = clean_sales_dataframe(boro_year_data) boro_year_data = make_new_sales_features(boro_year_data) ###First merge on BBL print 'Merging for year = {}, borough = {}'.format( *file.replace('.xls', '').split('_')) merged = pd.merge(boro_year_data, bbl_to_nta, on='BBL', how='left') ###Then use API to fill missed NTA_strings print 'Number missing NTA_strings after merge: ', sum( merged['NTA_string'].isnull()) for index, row in merged.iterrows(): if pd.isnull(row['NTA_string']): query_results = get_nta_name_through_api(row, g) merged.loc[index, 'NTA_string'] = query_results[0] if query_results[1] != 'No error': bad_api_calls[file][index] = query_results[1] print 'Number missing NTA_strings after API call: ', sum( merged['NTA_string'].isnull()) print 'Number of bad API calls: ', len(bad_api_calls[file]), '\n' ###Finally save NTA tagged data merged.to_csv('./../data/sales_data_nta_tagged/{}.csv'.format( file.replace('.xls', '')), index=True) with open('./../data/sales_data_nta_tagged/bad_api_calls.json', 'w') as outfile: json.dump(bad_api_calls, outfile)
with open('rfs.config.json') as conf:
    config = json.load(conf)

DBNAME = config['DBNAME']
DBUSER = config['DBUSER']

# Geoclient credentials are read from the same JSON config file.
app_id = config['GEOCLIENT_APP_ID']
app_key = config['GEOCLIENT_APP_KEY']

# connect to postgres db
engine = sql.create_engine('postgresql://{}@localhost:5432/{}'.format(
    DBUSER, DBNAME))

# get the geo data
g = Geoclient(app_id, app_key)


def get_loc(num, street, borough):
    """Geocode an address and read its latitude and longitude.

    Each coordinate falls back to the string ``'none'`` when it cannot be
    read from the Geoclient response.
    """
    geo = g.address(num, street, borough)

    # A single lookup is enough: the original retried the identical key
    # inside its own KeyError handler, which could never succeed.
    try:
        lat = geo['latitude']
    except KeyError:
        lat = 'none'

    # Stays broader than KeyError (as before) but no longer a bare
    # ``except``, so KeyboardInterrupt/SystemExit are not swallowed.
    try:
        lon = geo['longitude']
    except Exception:
        lon = 'none'
    # NOTE(review): no return statement is visible in this excerpt; the
    # remainder of the function presumably follows -- confirm in the file.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. """ from nyc_geoclient import Geoclient import os import sys import csv import time GEOCODE_FILE = sys.argv[1] APP_ID = os.environ['GEOCLIENT_APP_ID'] APP_KEY = os.environ['GEOCLIENT_APP_KEY'] g = Geoclient(APP_ID, APP_KEY) ERRORS = 0 PROCESSED = 0 def geocode(): with open(GEOCODE_FILE + '-geocoded.csv', 'a') as geocode_file: writer = csv.writer(geocode_file, delimiter="|") with open(GEOCODE_FILE, 'r') as f: csv_file = csv.reader(f, delimiter="|") for row in csv_file: id = row[0] house_number = row[1] street = row[2] zipcode = row[3] info = g.address_zip(house_number, street, zipcode)
import json import os import glob import sys from time import sleep # For each row in the TSV: # Geocode address # Write full json contents to a file # write the original data + school, lat and long to a second file if len(sys.argv) != 5: print "Error: %s Need App ID, App Key, borough, and sold/listed status to run" % sys.argv[0] sys.exit(1) g = Geoclient(sys.argv[1], sys.argv[2]) borough = sys.argv[3] status = sys.argv[4].lower() if status != 'sold' and status != 'listed': print "Error: %s Stats must be written as sold or listed" % sys.argv[4] sys.exit(1) # Name the files that will be output, based on their status of sold or listed path = borough + '_' + status + '.csv' path2 = borough + '_' + status + '.json' # For every file the directory for file in glob.glob(os.getcwd() + "/" + borough + "/" + "*_" + status +".tsv"): # Use to skip unnecessary lines in file
import csv from nyc_geoclient import Geoclient # Read csv with ID and key. # csv must be in same folder and be like the following # "appID","appKey" # "309245e","c45458765e3h8560erg898160" with open('app_id_nyc.csv', 'rb') as f: reader = csv.reader(f) id_key = map(tuple, reader) my_app_ID = id_key[1][0] my_app_key = id_key[1][1] g = Geoclient(my_app_ID, my_app_key) with open('../out/nyc_sales_clean.csv', 'rb') as f: reader = csv.reader(f) nyc_sales = map(tuple, reader) nrow_nyc = len(nyc_sales) with open("../out/coords_nyc_api.csv", "wb") as csv_file: writer = csv.writer(csv_file, delimiter=',') writer.writerow([ 'id_sale', 'lat', 'long', 'returned_street_name', 'returned_zip_code' ]) for i in range(1, len(nyc_sales)): print(str(i) + ' / ' + str(nrow_nyc)) # address(houseNumber, street, borough)
def __init__(self, app_id, key):
    """Build a Geoclient from *app_id* and *key* and keep it on ``self._g``."""
    self._g = Geoclient(app_id, key)
class GeoHelper:
    """Geocode BINs, BBLs or street addresses through a ``Geoclient``.

    The wrapped client lives on ``self._g``. The public entry points,
    ``get_BINandBBL`` and ``GetLatLong``, add result columns to a DataFrame.
    """

    def __init__(self, app_id, key):
        """Build the underlying Geoclient from the API credentials."""
        self._g = Geoclient(app_id, key)

    def _find_id_column(self, col_list):
        """Return the best identifier column found in *col_list*.

        Column names are compared case-insensitively against a preference
        list (BIN spellings first, then BBL spellings, then ADDRESS) and the
        original spelling of the first match is returned.

        Raises:
            StopIteration: if no column matches, as before.
        """
        ids_by_pref = [
            'BIN', 'BUILDINGIDENTIFICATIONNUMBER', 'GEOCODEDBIN', 'BBL',
            'GEOCODEDBBL', 'ADDRESS'
        ]
        # Upper-case the names once; ``str()`` also tolerates non-str labels.
        col_map = dict((str(col).upper(), col) for col in col_list)
        return next(col_map[i] for i in ids_by_pref if i in col_map)

    def _inferredGeocoder(self, input_str):
        '''
        Attempts to infer the format of the input provided for geocoding,
        either BIN, BBL, or Street Address (very rudimentary).

        A 10-digit number is treated as a BBL, a 7-digit number as a BIN and
        any non-numeric text as "house_number street name borough". Returns
        the complete geoclient result, or None when the input is empty, has
        an unrecognized number of digits, or the address lookup raises.
        '''
        text = str(input_str)
        # IDs read as floats look like "1234567.0": drop that suffix only,
        # not every ".0" that happens to occur inside the text.
        if text.endswith('.0'):
            text = text[:-2]

        if text.isdigit():
            if len(text) == 10:
                # BBL: first digit, next five digits, last four digits.
                # int() already ignores leading zeros; stripping them first
                # turned an all-zero segment into '' and raised ValueError.
                return self._g.bbl(int(text[0]), int(text[1:6]),
                                   int(text[6:]))
            if len(text) == 7:
                # BIN is passed through as text.
                return self._g.bin(text)
            # Unrecognized number of digits, no ID possible.
            return None

        # Split free text into house number / street name / borough.
        parts = text.split()
        if not parts:
            return None
        house_num = parts[0]
        street_name = " ".join(parts[1:-1])
        boro_name = parts[-1]
        try:
            return self._g.address(house_num, street_name, boro_name)
        except Exception:
            print('Format not recognized')
            return None

    def _checkGeoclientValidity(self, geoclient_output):
        """Return True for a successful geoclient result, else an error text.

        A result counts as successful when its ``returnCode1a`` starts with
        ``'0'``. Otherwise the returned string carries the result's
        ``message`` when there is one. A missing result (``None``) or a
        result without ``message`` yields the generic error text instead of
        raising.
        """
        no_message = 'error returned with no message'
        if not geoclient_output:
            return no_message
        code = geoclient_output.get('returnCode1a')
        if code and str(code[0]) == '0':
            return True
        message = geoclient_output.get('message')
        if message:
            return 'Error Code: ' + str(message)
        return no_message

    def _addressGeocoder(self, df, house_num='houseNo', street='street',
                         boro='boro'):
        '''
        private function to make a generic address call to NYC geoclient.

        *df* is one row (anything indexable by column name); *house_num*,
        *street* and *boro* name the columns to read. The defaults are an
        assumption -- the original body referenced these names without
        defining them; confirm against the callers.

        Returns (BBL, BIN), or the text "Error: <exception class>" twice
        when the lookup fails.
        '''
        try:
            x = self._g.address(df[house_num], df[street], df[boro])
            BBL = x['bbl']
            BIN = x['buildingIdentificationNumber']
        except Exception as e:
            BBL = ("Error: %s" % type(e))
            BIN = BBL
        return BBL, BIN

    def _resolveIdColumn(self, df, identifier_col):
        """Return *identifier_col*, or detect one from ``df.columns``."""
        if identifier_col:
            print('using provided ID column: ' + identifier_col)
        else:
            identifier_col = self._find_id_column(df.columns)
            print('found ID column: ' + identifier_col)
        return identifier_col

    def _geocodePair(self, value, first_key, second_key):
        """Geocode *value* and return two fields of the result.

        When the lookup is not valid, the validity message is returned in
        both positions instead.
        """
        out = self._inferredGeocoder(value)
        log = self._checkGeoclientValidity(out)
        if log is True:
            return out[first_key], out[second_key]
        return log, log

    def get_BINandBBL(self, df, identifier_col=None):
        '''
        Uses DOITT's GeoClient (the web API to DCP's GeoSupport) via the
        python wrapper https://github.com/talos/nyc-geoclient to geocode a
        dataframe df from a single identifier column (BIN, BBL or address).
        When identifier_col is not given, the column is detected from the
        column names.

        Returns the dataframe df with two additional columns: geocodedBBL
        and geocodedBIN. Rows that could not be geocoded hold the error
        text in both columns.
        '''
        identifier_col = self._resolveIdColumn(df, identifier_col)

        def wrapper_func(x):
            return self._geocodePair(x[identifier_col], 'bbl',
                                     'buildingIdentificationNumber')

        df[['geocodedBBL', 'geocodedBIN']] = df.apply(
            wrapper_func, axis=1).apply(_pd.Series)
        return df

    def GetLatLong(self, df, identifier_col=None):
        """Add ``Latitude`` and ``Longitude`` columns to *df*.

        Each row's identifier (BIN, BBL or address) is geocoded and the
        result's ``latitudeInternalLabel`` / ``longitudeInternalLabel`` are
        stored. Rows that could not be geocoded hold the error text in both
        columns. When *identifier_col* is not given, the column is detected
        from the column names. Returns *df*.
        """
        identifier_col = self._resolveIdColumn(df, identifier_col)

        # Create the target columns before the multi-column assignment.
        df['Latitude'] = _np.nan
        df['Longitude'] = _np.nan

        def wrapper_func(x):
            return self._geocodePair(x[identifier_col],
                                     'latitudeInternalLabel',
                                     'longitudeInternalLabel')

        df[['Latitude', 'Longitude']] = df.apply(
            wrapper_func, axis=1).apply(_pd.Series)
        return df
# coding: utf-8
# written for python 2 using nyc_geoclient
# https://github.com/talos/nyc-geoclient
import pandas as pd
import numpy as np
import os, re, time
import sqlite3 as lite
from nyc_geoclient import Geoclient

# read-in NYC Geoclient API token
# The first two lines of the token file are passed, in order, as the two
# Geoclient constructor arguments; the ``with`` block closes the file handle,
# which the original left open.
with open('NYC_Geoclient_token.txt') as token:
    g = Geoclient(token.readline().strip('\n'), token.readline().strip('\n'))


def clean_strings(x):
    """Return ``str(x)`` without surrounding whitespace.

    ``np.nan`` is returned when the conversion raises ``ValueError``.
    """
    try:
        return str(x).strip()
    except ValueError:
        return np.nan


def parse_address(address):
    """Strip the apartment part from *address*.

    Everything from the first ``Apt``, ``APT`` or ``#`` marker onwards is
    dropped, then a trailing ", <digits>" or ", <digits><one word character>"
    suffix is split off as well.
    """
    separators = ['Apt', 'APT', '#']
    # gets rid of the apartments in the address
    # Cut cumulatively: the original reassigned no_apt on every pass, so only
    # the last separator ('#') ever had an effect.
    no_apt = address
    for separator in separators:
        if no_apt.find(separator) != -1:
            no_apt = no_apt.split(separator, 1)[0]
    # apartments can also be indicated by comma followed by number with
    # optional letter (ex: , 503C)
    street = re.split(r'(,\s\d+$|,\s\d+\w{1}$)', no_apt)
    street = street[0]  # grab what came before the apartment
    # NOTE(review): the rest of this function (including any return) is not
    # visible in this excerpt -- confirm in the full file.
# Read CSV of rent stabilized properties and grab BBL from NYC's GeoClient API # takes an input CSV file name and output CSV file name as argv # first two columns of input csv must be address number and address name # hardcoded for manhattan only, will update in the future # run script by doing: python geo-client-api-test.py input.csv output.csv from sys import argv from nyc_geoclient import Geoclient import csv import json script, infile, outfile = argv g = Geoclient('9cd0a15f', '54dc84bcaca9ff4877da771750033275') #test = g.address('140-154', 'West 72nd Street', 'Manhattan') #print json.dumps(test, sort_keys=True) print "opening file: %s" % infile with open(infile, 'r') as i: reader = csv.reader(i) print "opening file: %s" % outfile with open(outfile, 'w') as o: writer = csv.writer(o, lineterminator='\n') all = [] row = next(reader, None) row.append('bbl') all.append(row)